Merge branch 'dev' into celery-tasks
@@ -1,5 +1,4 @@
from django.contrib import admin
from django.contrib.auth.models import Group, User
from django.utils.html import format_html, format_html_join
from django.utils.safestring import mark_safe
from whoosh.writing import AsyncWriter
@@ -32,7 +31,7 @@ class TagAdmin(admin.ModelAdmin):
    list_filter = ("colour", "matching_algorithm")
    list_editable = ("colour", "match", "matching_algorithm")

    readonly_fields = ("slug",)
    readonly_fields = ("slug", )


class DocumentTypeAdmin(admin.ModelAdmin):
@@ -51,9 +50,17 @@ class DocumentTypeAdmin(admin.ModelAdmin):
class DocumentAdmin(admin.ModelAdmin):

    search_fields = ("correspondent__name", "title", "content", "tags__name")
    readonly_fields = ("added", "file_type", "storage_type",)
    list_display = ("title", "created", "added", "correspondent",
                    "tags_", "archive_serial_number", "document_type")
    readonly_fields = ("added", "file_type", "storage_type", "filename")
    list_display = (
        "title",
        "created",
        "added",
        "correspondent",
        "tags_",
        "archive_serial_number",
        "document_type",
        "filename"
    )
    list_filter = (
        "document_type",
        "tags",
@@ -120,8 +127,3 @@ admin.site.register(Tag, TagAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Log, LogAdmin)


# Unless we implement multi-user, these default registrations don't make sense.
admin.site.unregister(Group)
admin.site.unregister(User)
@@ -1,5 +1,4 @@
from django.apps import AppConfig
from django.db.models.signals import post_delete


class DocumentsConfig(AppConfig):
@@ -14,7 +13,6 @@ class DocumentsConfig(AppConfig):
            add_inbox_tags,
            run_pre_consume_script,
            run_post_consume_script,
            cleanup_document_deletion,
            set_log_entry,
            set_correspondent,
            set_document_type,
@@ -33,6 +31,4 @@ class DocumentsConfig(AppConfig):
        document_consumption_finished.connect(add_to_index)
        document_consumption_finished.connect(run_post_consume_script)

        post_delete.connect(cleanup_document_deletion)

        AppConfig.ready(self)
@@ -4,6 +4,8 @@ from django.conf import settings
from django.core.checks import Error, register
from django.db.utils import OperationalError, ProgrammingError

from documents.signals import document_consumer_declaration


@register()
def changed_password_check(app_configs, **kwargs):
@@ -37,3 +39,17 @@ def changed_password_check(app_configs, **kwargs):
        """))]

    return []


@register()
def parser_check(app_configs, **kwargs):

    parsers = []
    for response in document_consumer_declaration.send(None):
        parsers.append(response[1])

    if len(parsers) == 0:
        return [Error("No parsers found. This is a bug. The consumer won't be "
                      "able to consume any documents without parsers.")]
    else:
        return []
@@ -3,7 +3,6 @@ import logging
import os
import pickle
import re
import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
@@ -64,7 +63,7 @@ class DocumentClassifier(object):

    def save_classifier(self):
        with open(settings.MODEL_FILE, "wb") as f:
            pickle.dump(self.FORMAT_VERSION, f)  # Version
            pickle.dump(self.FORMAT_VERSION, f)
            pickle.dump(self.data_hash, f)
            pickle.dump(self.data_vectorizer, f)
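For context, whatever reads this model file back has to unpickle the same fields in the same order they were dumped here, starting with the format version. A minimal sketch of that load path (hypothetical helper name; the real reload() presumably also restores the remaining classifier fields):

import pickle

def load_classifier_sketch(model_file, expected_version):
    # Read the fields back in the exact order save_classifier() wrote them.
    with open(model_file, "rb") as f:
        version = pickle.load(f)
        if version != expected_version:
            # Mirrors the IncompatibleClassifierVersionError the consumer
            # catches elsewhere in this diff.
            raise ValueError("Incompatible classifier format version")
        data_hash = pickle.load(f)
        data_vectorizer = pickle.load(f)
    return data_hash, data_vectorizer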
@@ -89,16 +88,14 @@ class DocumentClassifier(object):
            data.append(preprocessed_content)

            y = -1
            if doc.document_type:
                if doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
                    y = doc.document_type.pk
            if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = doc.document_type.pk
            m.update(y.to_bytes(4, 'little', signed=True))
            labels_document_type.append(y)

            y = -1
            if doc.correspondent:
                if doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
                    y = doc.correspondent.pk
            if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = doc.correspondent.pk
            m.update(y.to_bytes(4, 'little', signed=True))
            labels_correspondent.append(y)

@@ -120,8 +117,8 @@ class DocumentClassifier(object):

        num_tags = len(labels_tags_unique)
        # subtract 1 since -1 (null) is also part of the classes.
        num_correspondents = len(labels_correspondent) - 1
        num_document_types = len(labels_document_type) - 1
        num_correspondents = len(set(labels_correspondent)) - 1
        num_document_types = len(set(labels_document_type)) - 1

        logging.getLogger(__name__).debug(
            "{} documents, {} tag(s), {} correspondent(s), "
@@ -137,7 +134,7 @@ class DocumentClassifier(object):
        logging.getLogger(__name__).debug("Vectorizing data...")
        self.data_vectorizer = CountVectorizer(
            analyzer="word",
            ngram_range=(1,2),
            ngram_range=(1, 2),
            min_df=0.01
        )
        data_vectorized = self.data_vectorizer.fit_transform(data)
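The switch to len(set(...)) above matters because the label lists hold one entry per document, not one per class. A quick sketch of the difference, assuming three documents that share a single correspondent:

labels_correspondent = [-1, 3, 3]  # one label per document, -1 means "none"

num_wrong = len(labels_correspondent) - 1       # 2, but only 1 real correspondent
num_right = len(set(labels_correspondent)) - 1  # 1: distinct classes minus the -1 class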
@@ -3,7 +3,6 @@ import hashlib
import logging
import os
import re
import uuid

from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
@@ -13,7 +12,9 @@ from django.utils import timezone

from paperless.db import GnuPG
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .models import Document, FileInfo
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class
from .signals import (
    document_consumption_finished,
@@ -25,17 +26,10 @@ class ConsumerError(Exception):
    pass


class Consumer:
    """
    Loop over every file found in CONSUMPTION_DIR and:
      1. Convert it to a greyscale pnm
      2. Use tesseract on the pnm
      3. Store the document in the MEDIA_ROOT with optional encryption
      4. Store the OCR'd text in the database
      5. Delete the document and image(s)
    """
class Consumer(LoggingMixin):

    def _send_progress(self, filename, current_progress, max_progress, status, message, document_id=None):
    def _send_progress(self, filename, current_progress, max_progress, status,
                       message, document_id=None):
        payload = {
            'filename': os.path.basename(filename),
            'current_progress': current_progress,
@@ -44,156 +38,226 @@ class Consumer:
            'message': message,
            'document_id': document_id
        }
        async_to_sync(self.channel_layer.group_send)("status_updates", {'type': 'status_update', 'data': payload})
        async_to_sync(self.channel_layer.group_send)("status_updates",
                                                     {'type': 'status_update',
                                                      'data': payload})
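On the receiving end, Channels routes a group_send with type 'status_update' to a handler method of the same name on every consumer in the group. A minimal sketch of such a websocket consumer (hypothetical class, not part of this diff), assuming channels' AsyncWebsocketConsumer:

import json
from channels.generic.websocket import AsyncWebsocketConsumer

class StatusConsumer(AsyncWebsocketConsumer):
    async def connect(self):
        # Join the group that Consumer._send_progress() publishes to.
        await self.channel_layer.group_add("status_updates", self.channel_name)
        await self.accept()

    async def status_update(self, event):
        # Invoked for messages sent with {'type': 'status_update', ...}.
        await self.send(text_data=json.dumps(event["data"]))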
    def __init__(self, consume=settings.CONSUMPTION_DIR,
                 scratch=settings.SCRATCH_DIR):

        self.logger = logging.getLogger(__name__)
        self.logging_group = None

        self.consume = consume
        self.scratch = scratch

        self.classifier = DocumentClassifier()
    def __init__(self):
        super().__init__()
        self.path = None
        self.filename = None
        self.override_title = None
        self.override_correspondent_id = None
        self.override_tag_ids = None
        self.override_document_type_id = None

        self.channel_layer = get_channel_layer()

        os.makedirs(self.scratch, exist_ok=True)
    def pre_check_file_exists(self):
        if not os.path.isfile(self.path):
            raise ConsumerError("Cannot consume {}: It is not a file".format(
                self.path))

        self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        if settings.PASSPHRASE:
            self.storage_type = Document.STORAGE_TYPE_GPG

        if not self.consume:
    def pre_check_consumption_dir(self):
        if not settings.CONSUMPTION_DIR:
            raise ConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )
                "set.")

        if not os.path.exists(self.consume):
        if not os.path.isdir(settings.CONSUMPTION_DIR):
            raise ConsumerError(
                "Consumption directory {} does not exist".format(self.consume))
                "Consumption directory {} does not exist".format(
                    settings.CONSUMPTION_DIR))

    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
            "group": self.logging_group
        })
    def pre_check_regex(self):
        if not re.match(FileInfo.REGEXES["title"], self.filename):
            raise ConsumerError(
                "Filename {} does not seem to be safe to "
                "consume".format(self.filename))

    @transaction.atomic
    def try_consume_file(self, file):
        """
        Return True if file was consumed
        """

        self.logging_group = uuid.uuid4()

        if not re.match(FileInfo.REGEXES["title"], file):
            return False

        doc = file

        if self._is_duplicate(doc):
            self.log(
                "warning",
                "Skipping {} as it appears to be a duplicate".format(doc)
    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        if Document.objects.filter(checksum=checksum).exists():
            if settings.CONSUMER_DELETE_DUPLICATES:
                os.unlink(self.path)
            raise ConsumerError(
                "Not consuming {}: It is a duplicate.".format(self.filename)
            )
            return False

        self.log("info", "Consuming {}".format(doc))
    def pre_check_directories(self):
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
    def try_consume_file(self,
                         path,
                         override_filename=None,
                         override_title=None,
                         override_correspondent_id=None,
                         override_document_type_id=None,
                         override_tag_ids=None):
        """
        Return the document object if it was successfully created.
        """

        parser_class = get_parser_class(doc)
        self.path = path
        self.filename = override_filename or os.path.basename(path)
        self.override_title = override_title
        self.override_correspondent_id = override_correspondent_id
        self.override_document_type_id = override_document_type_id
        self.override_tag_ids = override_tag_ids

        # this is for grouping logging entries for this particular file
        # together.

        self.renew_logging_group()

        # Make sure that preconditions for consuming the file are met.

        self.pre_check_file_exists()
        self.pre_check_consumption_dir()
        self.pre_check_directories()
        self.pre_check_regex()
        self.pre_check_duplicate()

        self.log("info", "Consuming {}".format(self.filename))

        # Determine the parser class.

        parser_class = get_parser_class(self.filename)
        if not parser_class:
            self.log(
                "error", "No parsers could be found for {}".format(doc))
            return False
            raise ConsumerError("No parsers available for {}".format(self.filename))
        else:
            self.log("info", "Parser: {}".format(parser_class.__name__))
            self.log("debug", "Parser: {}".format(parser_class.__name__))

        self._send_progress(file, 0, 100, 'WORKING', 'Consumption started')
        # Notify all listeners that we're going to do some work.

        self._send_progress(self.filename, 0, 100, 'WORKING', 'Consumption started')

        document_consumption_started.send(
            sender=self.__class__,
            filename=doc,
            filename=self.path,
            logging_group=self.logging_group
        )

        def progress_callback(current_progress, max_progress, message):
            # recalculate progress to be within 20 and 80
            p = int((current_progress / max_progress) * 60 + 20)
            self._send_progress(file, p, 100, "WORKING", message)
            self._send_progress(self.filename, p, 100, "WORKING", message)
        document_parser = parser_class(doc, self.logging_group, progress_callback)
        # This doesn't parse the document yet, but gives us a parser.

        document_parser = parser_class(self.path, self.logging_group, progress_callback)

        # However, this already created working directories which we have to
        # clean up.

        # Parse the document. This may take some time.

        try:
            self.log("info", "Generating thumbnail for {}...".format(doc))
            self._send_progress(file, 10, 100, 'WORKING',
            self.log("debug", "Generating thumbnail for {}...".format(self.filename))
            self._send_progress(self.filename, 10, 100, 'WORKING',
                                'Generating thumbnail...')
            thumbnail = document_parser.get_optimised_thumbnail()
            self._send_progress(file, 20, 100, 'WORKING',
            self.log("debug", "Parsing {}...".format(self.filename))
            self._send_progress(self.filename, 20, 100, 'WORKING',
                                'Getting text from document...')
            text = document_parser.get_text()
            self._send_progress(file, 80, 100, 'WORKING',
            self._send_progress(self.filename, 80, 100, 'WORKING',
                                'Getting date from document...')
            date = document_parser.get_date()
            self._send_progress(file, 85, 100, 'WORKING',
                                'Storing the document...')
            document = self._store(
                text,
                doc,
                thumbnail,
                date
            )
        except ParseError as e:
            self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
            document_parser.cleanup()
            self._send_progress(self.filename, 100, 100, 'FAILED',
                                "Failed: {}".format(e))
            raise ConsumerError(e)

        # Prepare the document classifier.

        # TODO: I don't really like to do this here, but this way we avoid
        # reloading the classifier multiple times, since there are multiple
        # post-consume hooks that all require the classifier.

        try:
            classifier = DocumentClassifier()
            classifier.reload()
        except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
            logging.getLogger(__name__).warning(
                "Cannot classify documents: {}.".format(e))
            classifier = None
        self._send_progress(self.filename, 85, 100, 'WORKING',
                            'Storing the document...')
        # now that everything is done, we can start to store the document
        # in the system. This will be a transaction and reasonably fast.
        try:
            with transaction.atomic():

                # store the document.
                document = self._store(
                    text=text,
                    date=date
                )

                # If we get here, it was successful. Proceed with post-consume
                # hooks. If they fail, nothing will get changed.

                self._send_progress(self.filename, 90, 100, 'WORKING',
                                    'Performing post-consumption tasks...')

                document_consumption_finished.send(
                    sender=self.__class__,
                    document=document,
                    logging_group=self.logging_group,
                    classifier=classifier
                )

                # After everything is in the database, copy the files into
                # place. If this fails, we'll also rollback the transaction.

                create_source_path_directory(document.source_path)
                self._write(document, self.path, document.source_path)
                self._write(document, thumbnail, document.thumbnail_path)

                # Delete the file only if it was successfully consumed
                self.log("debug", "Deleting file {}".format(self.path))
                os.unlink(self.path)
        except Exception as e:
            raise ConsumerError(e)
            self._send_progress(file, 100, 100, 'FAILED',
                                "Failed: {}".format(e))

        finally:
            document_parser.cleanup()
            return False
        else:
            document_parser.cleanup()
            self._cleanup_doc(doc)

            self.log(
                "info",
                "Document {} consumption finished".format(document)
            )
            self.log(
                "info",
                "Document {} consumption finished".format(document)
            )
            classifier = None
            self._send_progress(file, 100, 100, 'SUCCESS',
                                'Finished.', document.id)

            try:
                self.classifier.reload()
                classifier = self.classifier
            except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
                logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
            return document

            self._send_progress(file, 90, 100, 'WORKING',
                                'Performing post-consumption tasks...')
    def _store(self, text, date):

            document_consumption_finished.send(
                sender=self.__class__,
                document=document,
                logging_group=self.logging_group,
                classifier=classifier
            )
            self._send_progress(file, 100, 100, 'SUCCESS',
                                'Finished.', document.id)
            return True
        # If someone gave us the original filename, use it instead of doc.

    def _store(self, text, doc, thumbnail, date):
        file_info = FileInfo.from_path(self.filename)

        file_info = FileInfo.from_path(doc)

        stats = os.stat(doc)
        stats = os.stat(self.path)

        self.log("debug", "Saving record to database")

        created = file_info.created or date or timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))
            datetime.datetime.fromtimestamp(stats.st_mtime))

        with open(doc, "rb") as f:
        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG
        else:
            storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        with open(self.path, "rb") as f:
            document = Document.objects.create(
                correspondent=file_info.correspondent,
                title=file_info.title,
@@ -202,7 +266,7 @@ class Consumer:
                checksum=hashlib.md5(f.read()).hexdigest(),
                created=created,
                modified=created,
                storage_type=self.storage_type
                storage_type=storage_type
            )

        relevant_tags = set(file_info.tags)
@@ -211,14 +275,30 @@ class Consumer:
            self.log("debug", "Tagging with {}".format(tag_names))
            document.tags.add(*relevant_tags)

        self._write(document, doc, document.source_path)
        self._write(document, thumbnail, document.thumbnail_path)
        self.apply_overrides(document)

        # TODO: why do we need to save the document again?
        document.filename = generate_filename(document)

        # We need to save the document twice, since we need the PK of the
        # document in order to create its filename above.
        document.save()

        return document

    def apply_overrides(self, document):
        if self.override_title:
            document.title = self.override_title

        if self.override_correspondent_id:
            document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)

        if self.override_document_type_id:
            document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)

        if self.override_tag_ids:
            for tag_id in self.override_tag_ids:
                document.tags.add(Tag.objects.get(pk=tag_id))

    def _write(self, document, source, target):
        with open(source, "rb") as read_file:
            with open(target, "wb") as write_file:
@@ -227,13 +307,3 @@ class Consumer:
                    return
                self.log("debug", "Encrypting")
                write_file.write(GnuPG.encrypted(read_file))

    def _cleanup_doc(self, doc):
        self.log("debug", "Deleting document {}".format(doc))
        os.unlink(doc)

    @staticmethod
    def _is_duplicate(doc):
        with open(doc, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        return Document.objects.filter(checksum=checksum).exists()
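Taken together, the refactored consumer is now invoked once per file instead of owning a scan loop. A minimal usage sketch, assuming a valid path inside the consumption directory:

from documents.consumer import Consumer, ConsumerError

consumer = Consumer()
try:
    document = consumer.try_consume_file(
        "/consume/invoice.pdf",
        override_title="Invoice March",
        override_tag_ids=[1, 2],
    )
except ConsumerError as e:
    # Any failed precondition check or parse error surfaces here.
    print("Consumption failed:", e)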
src/documents/file_handling.py (new file, 102 lines)
@@ -0,0 +1,102 @@
import logging
import os
from collections import defaultdict

from django.conf import settings
from django.template.defaultfilters import slugify


def create_source_path_directory(source_path):
    os.makedirs(os.path.dirname(source_path), exist_ok=True)


def delete_empty_directories(directory):
    # Go up in the directory hierarchy and try to delete all directories
    directory = os.path.normpath(directory)
    root = os.path.normpath(settings.ORIGINALS_DIR)

    if not directory.startswith(root + os.path.sep):
        # don't do anything outside our originals folder.

        # append os.path.sep so that we avoid these cases:
        #   directory = /home/originals2/test
        #   root = /home/originals ("/" gets appended and startswith fails)
        return

    while directory != root:
        if not os.listdir(directory):
            # it's empty
            try:
                os.rmdir(directory)
            except OSError:
                # whatever. empty directories aren't that bad anyway.
                return
        else:
            # it's not empty.
            return

        # go one level up
        directory = os.path.normpath(os.path.dirname(directory))


def many_to_dictionary(field):
    # Converts a ManyToManyField to a dictionary by assuming that field
    # entries contain an _ or - which will be used as a delimiter
    mydictionary = dict()

    for index, t in enumerate(field.all()):
        # Populate tag names by index
        mydictionary[index] = slugify(t.name)

        # Find delimiter
        delimiter = t.name.find('_')

        if delimiter == -1:
            delimiter = t.name.find('-')

        if delimiter == -1:
            continue

        key = t.name[:delimiter]
        value = t.name[delimiter + 1:]

        mydictionary[slugify(key)] = slugify(value)

    return mydictionary
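many_to_dictionary assumes tag names encode key-value pairs with "_" or "-" as the delimiter. An illustration of the resulting mapping, using hypothetical tag names:

# For tags named "type_invoice" and "paid", many_to_dictionary() returns
# approximately:
#   {0: "type_invoice", 1: "paid", "type": "invoice"}
# Every tag is indexed by position; "type_invoice" additionally splits on
# "_" into a key-value entry, while "paid" has no delimiter and is skipped.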
def generate_filename(document):
    # Create filename based on configured format
    path = ""

    try:
        if settings.PAPERLESS_FILENAME_FORMAT is not None:
            tags = defaultdict(lambda: slugify(None),
                               many_to_dictionary(document.tags))
            path = settings.PAPERLESS_FILENAME_FORMAT.format(
                correspondent=slugify(document.correspondent),
                title=slugify(document.title),
                created=slugify(document.created),
                created_year=document.created.year if document.created else "none",
                created_month=document.created.month if document.created else "none",
                created_day=document.created.day if document.created else "none",
                added=slugify(document.added),
                added_year=document.added.year if document.added else "none",
                added_month=document.added.month if document.added else "none",
                added_day=document.added.day if document.added else "none",
                tags=tags,
            )
    except (ValueError, KeyError, IndexError):
        logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default.".format(settings.PAPERLESS_FILENAME_FORMAT))

    # Always append the primary key to guarantee uniqueness of filename
    if len(path) > 0:
        filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
    else:
        filename = "%07i.%s" % (document.pk, document.file_type)

    # Append .gpg for encrypted files
    if document.storage_type == document.STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename
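A worked example of the format logic above, assuming a hypothetical PAPERLESS_FILENAME_FORMAT:

# With PAPERLESS_FILENAME_FORMAT = "{created_year}/{correspondent}/{title}",
# a PDF with pk=123, created in 2020, correspondent "ACME Inc" and title
# "Invoice" comes out as:
#   "2020/acme-inc/invoice-0000123.pdf"
# and "2020/acme-inc/invoice-0000123.pdf.gpg" when stored encrypted.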
@@ -1,10 +1,11 @@
import os

import tempfile
from datetime import datetime
from time import mktime

from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError


@@ -19,12 +20,6 @@ class UploadForm(forms.Form):
            raise forms.ValidationError("That filename is suspicious.")
        return self.cleaned_data.get("document")

    def get_filename(self, i=None):
        return os.path.join(
            settings.CONSUMPTION_DIR,
            "{}_{}".format(str(i), self.cleaned_data.get("document").name) if i else self.cleaned_data.get("document").name
        )

    def save(self):
        """
        Since the consumer already does a lot of work, it's easier just to save
@@ -33,15 +28,16 @@ class UploadForm(forms.Form):
        """

        document = self.cleaned_data.get("document").read()
        original_filename = self.cleaned_data.get("document").name

        t = int(mktime(datetime.now().timetuple()))

        file_name = self.get_filename()
        i = 0
        while os.path.exists(file_name):
            i += 1
            file_name = self.get_filename(i)
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        # TODO: don't just append pdf. This is here for that weird regex check at the start of the consumer.
        with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:

        with open(file_name, "wb") as f:
            f.write(document)
            os.utime(file_name, times=(t, t))
            os.utime(f.name, times=(t, t))

            async_task("documents.tasks.consume_file", f.name, override_filename=original_filename, task_name=os.path.basename(original_filename))
@@ -1,7 +1,6 @@
import logging
from contextlib import contextmanager

from django.db import models
from django.dispatch import receiver
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.highlight import Formatter, get_text
@@ -9,10 +8,8 @@ from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter

from documents.models import Document
from paperless import settings


logger = logging.getLogger(__name__)


@@ -69,6 +66,9 @@ def open_index(recreate=False):
    if exists_in(settings.INDEX_DIR) and not recreate:
        return open_dir(settings.INDEX_DIR)
    else:
        # TODO: this is not thread safe. If 2 instances try to create the index
        # at the same time, this fails. This currently prevents parallel
        # tests.
        return create_in(settings.INDEX_DIR, get_schema())


@@ -99,15 +99,19 @@ def remove_document_from_index(document):
        remove_document(writer, document)


@contextmanager
def query_page(ix, query, page):
    with ix.searcher() as searcher:
    searcher = ix.searcher()
    try:
        query_parser = MultifieldParser(["content", "title", "correspondent"],
                                        ix.schema).parse(query)
        result_page = searcher.search_page(query_parser, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()
        return result_page
        yield result_page
    finally:
        searcher.close()
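Because query_page is now a generator-based context manager, callers keep the searcher open while they read results and it is closed for them on exit. A minimal usage sketch:

ix = open_index()
with query_page(ix, "invoice 2020", 1) as result_page:
    for hit in result_page:
        print(hit)
# Leaving the with-block closes the searcher, so the hits must be
# consumed inside it.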


def autocomplete(ix, term, limit=10):
@@ -1,4 +1,5 @@
import logging
import uuid


class PaperlessHandler(logging.Handler):
@@ -13,3 +14,19 @@ class PaperlessHandler(logging.Handler):
            kwargs["group"] = record.group

        Log.objects.create(**kwargs)


class LoggingMixin:

    logging_group = None

    def renew_logging_group(self):
        self.logging_group = uuid.uuid4()

    def log(self, level, message):
        target = ".".join([self.__class__.__module__, self.__class__.__name__])
        logger = logging.getLogger(target)

        getattr(logger, level)(message, extra={
            "group": self.logging_group
        })
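Any class can opt into grouped logging by mixing this in, which is exactly what the Consumer and DocumentParser changes in this commit do. A minimal sketch:

class MyWorker(LoggingMixin):
    def run(self):
        self.renew_logging_group()  # one uuid per unit of work
        self.log("info", "starting work")
        # every record carries extra={"group": <uuid>} so log entries for
        # one file can be correlated later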
@@ -1,250 +0,0 @@
import datetime
import imaplib
import logging
import os
import re
import time
import uuid

from base64 import b64decode
from email import policy
from email.parser import BytesParser
from dateutil import parser

from django.conf import settings

from .models import Correspondent


class MailFetcherError(Exception):
    pass


class InvalidMessageError(MailFetcherError):
    pass


class Loggable(object):

    def __init__(self, group=None):
        self.logger = logging.getLogger(__name__)
        self.logging_group = group or uuid.uuid4()

    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
            "group": self.logging_group
        })


class Message(Loggable):
    """
    A crude, but simple email message class. We assume that there's a subject
    and n attachments, and that we don't care about the message body.
    """

    SECRET = os.getenv("PAPERLESS_EMAIL_SECRET")

    def __init__(self, data, group=None):
        """
        Cribbed heavily from
        https://www.ianlewis.org/en/parsing-email-attachments-python
        """

        Loggable.__init__(self, group=group)

        self.subject = None
        self.time = None
        self.attachment = None

        message = BytesParser(policy=policy.default).parsebytes(data)
        self.subject = str(message["Subject"]).replace("\r\n", "")
        self.body = str(message.get_body())

        self.check_subject()
        self.check_body()

        self._set_time(message)

        self.log("info", 'Importing email: "{}"'.format(self.subject))

        attachments = []
        for part in message.walk():

            content_disposition = part.get("Content-Disposition")
            if not content_disposition:
                continue

            dispositions = content_disposition.strip().split(";")
            if len(dispositions) < 2:
                continue

            if not dispositions[0].lower() == "attachment" and \
                    "filename" not in dispositions[1].lower():
                continue

            file_data = part.get_payload()

            attachments.append(Attachment(
                b64decode(file_data), content_type=part.get_content_type()))

        if len(attachments) == 0:
            raise InvalidMessageError(
                "There don't appear to be any attachments to this message")

        if len(attachments) > 1:
            raise InvalidMessageError(
                "There's more than one attachment to this message. It cannot "
                "be indexed automatically."
            )

        self.attachment = attachments[0]
    def __bool__(self):
        return bool(self.attachment)

    def check_subject(self):
        if self.subject is None:
            raise InvalidMessageError("Message does not have a subject")
        if not Correspondent.SAFE_REGEX.match(self.subject):
            raise InvalidMessageError("Message subject is unsafe: {}".format(
                self.subject))

    def check_body(self):
        if self.SECRET not in self.body:
            raise InvalidMessageError("The secret wasn't in the body")

    def _set_time(self, message):
        self.time = datetime.datetime.now()
        message_time = message.get("Date")
        if message_time:
            try:
                self.time = parser.parse(message_time)
            except (ValueError, AttributeError):
                pass  # We assume that "now" is ok

    @property
    def file_name(self):
        return "{}.{}".format(self.subject, self.attachment.suffix)


class Attachment(object):

    SAFE_SUFFIX_REGEX = re.compile(
        r"^(application/(pdf))|(image/(png|jpeg|gif|tiff))$")

    def __init__(self, data, content_type):

        self.content_type = content_type
        self.data = data
        self.suffix = None

        m = self.SAFE_SUFFIX_REGEX.match(self.content_type)
        if not m:
            raise MailFetcherError(
                "Not-awesome file type: {}".format(self.content_type))
        self.suffix = m.group(2) or m.group(4)

    def read(self):
        return self.data


class MailFetcher(Loggable):

    def __init__(self, consume=settings.CONSUMPTION_DIR):

        Loggable.__init__(self)

        self._connection = None
        self._host = os.getenv("PAPERLESS_CONSUME_MAIL_HOST")
        self._port = os.getenv("PAPERLESS_CONSUME_MAIL_PORT")
        self._username = os.getenv("PAPERLESS_CONSUME_MAIL_USER")
        self._password = os.getenv("PAPERLESS_CONSUME_MAIL_PASS")
        self._inbox = os.getenv("PAPERLESS_CONSUME_MAIL_INBOX", "INBOX")

        self._enabled = bool(self._host)
        if self._enabled and Message.SECRET is None:
            raise MailFetcherError("No PAPERLESS_EMAIL_SECRET defined")

        self.last_checked = time.time()
        self.consume = consume

    def pull(self):
        """
        Fetch all available mail at the target address and store it locally in
        the consumption directory so that the file consumer can pick it up and
        do its thing.
        """

        if self._enabled:

            # Reset the grouping id for each fetch
            self.logging_group = uuid.uuid4()

            self.log("debug", "Checking mail")

            for message in self._get_messages():

                self.log("info", 'Storing email: "{}"'.format(message.subject))

                t = int(time.mktime(message.time.timetuple()))
                file_name = os.path.join(self.consume, message.file_name)
                with open(file_name, "wb") as f:
                    f.write(message.attachment.data)
                    os.utime(file_name, times=(t, t))

        self.last_checked = time.time()

    def _get_messages(self):

        r = []
        try:

            self._connect()
            self._login()

            for message in self._fetch():
                if message:
                    r.append(message)

            self._connection.expunge()
            self._connection.close()
            self._connection.logout()

        except MailFetcherError as e:
            self.log("error", str(e))

        return r

    def _connect(self):
        try:
            self._connection = imaplib.IMAP4_SSL(self._host, self._port)
        except OSError as e:
            msg = "Problem connecting to {}: {}".format(self._host, e.strerror)
            raise MailFetcherError(msg)

    def _login(self):

        login = self._connection.login(self._username, self._password)
        if not login[0] == "OK":
            raise MailFetcherError("Can't log into mail: {}".format(login[1]))

        inbox = self._connection.select(self._inbox)
        if not inbox[0] == "OK":
            raise MailFetcherError("Can't find the inbox: {}".format(inbox[1]))

    def _fetch(self):

        for num in self._connection.search(None, "ALL")[1][0].split():

            __, data = self._connection.fetch(num, "(RFC822)")

            message = None
            try:
                message = Message(data[0][1], self.logging_group)
            except InvalidMessageError as e:
                self.log("error", str(e))
            else:
                self._connection.store(num, "+FLAGS", "\\Deleted")

            if message:
                yield message
@@ -3,11 +3,10 @@ import os

from django.conf import settings
from django.core.management.base import BaseCommand

from watchdog.observers import Observer
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler

from documents.consumer import Consumer
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver

try:
    from inotify_simple import INotify, flags
@@ -17,17 +16,25 @@ except ImportError:

class Handler(FileSystemEventHandler):

    def __init__(self, consumer):
        self.consumer = consumer
    def _consume(self, file):
        if os.path.isfile(file):
            try:
                async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
            except Exception as e:
                # Catch all so that the consumer won't crash.
                logging.getLogger(__name__).error("Error while consuming document: {}".format(e))

    def on_created(self, event):
        self.consumer.try_consume_file(event.src_path)
        self._consume(event.src_path)

    def on_moved(self, event):
        self._consume(event.src_path)


class Command(BaseCommand):
    """
    On every iteration of an infinite loop, consume what we can from the
    consumption directory, and fetch any mail available.
    consumption directory.
    """

    def __init__(self, *args, **kwargs):
@@ -35,12 +42,6 @@ class Command(BaseCommand):
        self.verbosity = 0
        self.logger = logging.getLogger(__name__)

        self.file_consumer = None
        self.mail_fetcher = None
        self.first_iteration = True

        self.consumer = Consumer()

        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
@@ -56,9 +57,6 @@ class Command(BaseCommand):
        self.verbosity = options["verbosity"]
        directory = options["directory"]

        for d in (settings.ORIGINALS_DIR, settings.THUMBNAIL_DIR):
            os.makedirs(d, exist_ok=True)

        logging.getLogger(__name__).info(
            "Starting document consumer at {}".format(
                directory
@@ -68,11 +66,16 @@ class Command(BaseCommand):
        # Consume all files as this is not done initially by the watchdog
        for entry in os.scandir(directory):
            if entry.is_file():
                self.consumer.try_consume_file(entry.path)
                async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))

        # Start the watchdog. Woof!
        observer = Observer()
        event_handler = Handler(self.consumer)
        if settings.CONSUMER_POLLING > 0:
            logging.getLogger(__name__).info('Using polling instead of file'
                                             'system notifications.')
            observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        else:
            observer = Observer()
        event_handler = Handler()
        observer.schedule(event_handler, directory, recursive=True)
        observer.start()
        try:
@@ -1,4 +1,5 @@
from django.core.management.base import BaseCommand

from ...mixins import Renderable
from ...tasks import train_classifier


@@ -1,16 +1,15 @@
import json
import os
import time
import shutil
import time

from django.core.management.base import BaseCommand, CommandError
from django.core import serializers
from django.core.management.base import BaseCommand, CommandError

from documents.models import Document, Correspondent, Tag, DocumentType
from paperless.db import GnuPG

from ...mixins import Renderable
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from ...mixins import Renderable


class Command(Renderable, BaseCommand):
@@ -3,15 +3,14 @@ import os
import shutil

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError

from documents.models import Document
from paperless.db import GnuPG

from ...mixins import Renderable

from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from ...file_handling import generate_filename, create_source_path_directory
from ...mixins import Renderable


class Command(Renderable, BaseCommand):
@@ -82,6 +81,10 @@ class Command(Renderable, BaseCommand):

    def _import_files_from_manifest(self):

        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG

        for record in self.manifest:

            if not record["model"] == "documents.document":
@@ -94,6 +97,14 @@ class Command(Renderable, BaseCommand):
            document_path = os.path.join(self.source, doc_file)
            thumbnail_path = os.path.join(self.source, thumb_file)

            document.storage_type = storage_type
            document.filename = generate_filename(document)

            if os.path.isfile(document.source_path):
                raise FileExistsError(document.source_path)

            create_source_path_directory(document.source_path)

            if settings.PASSPHRASE:

                with open(document_path, "rb") as unencrypted:
@@ -109,18 +120,8 @@ class Command(Renderable, BaseCommand):
                        encrypted.write(GnuPG.encrypted(unencrypted))

            else:

                print("Moving {} to {}".format(document_path, document.source_path))
                shutil.copy(document_path, document.source_path)
                shutil.copy(thumbnail_path, document.thumbnail_path)

        # Reset the storage type to whatever we've used while importing

        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG

        Document.objects.filter(
            pk__in=[r["pk"] for r in self.manifest]
        ).update(
            storage_type=storage_type
        )
            document.save()
@@ -8,5 +8,5 @@ class Command(BaseCommand):
    help = "A quick & dirty way to see what's in the logs"

    def handle(self, *args, **options):
        for l in Log.objects.order_by("pk"):
            print(l)
        for log in Log.objects.order_by("pk"):
            print(log)
@@ -1,7 +1,6 @@
from django.core.management.base import BaseCommand

from documents.models import Document, Tag

from documents.models import Document
from ...mixins import Renderable


@@ -9,16 +9,14 @@ def match_correspondents(document_content, classifier):
    correspondents = Correspondent.objects.all()
    predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None

    matched_correspondents = [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
    return matched_correspondents
    return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]


def match_document_types(document_content, classifier):
    document_types = DocumentType.objects.all()
    predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None

    matched_document_types = [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
    return matched_document_types
    return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]


def match_tags(document_content, classifier):
@@ -1,7 +1,4 @@
# Generated by Django 3.1.3 on 2020-11-07 12:35
import os

from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion


@@ -9,11 +9,11 @@ from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
    schedule('documents.tasks.train_classifier', name="Train the classifier", schedule_type=Schedule.HOURLY)
    schedule('documents.tasks.index_optimize', name="Optimize the index", schedule_type=Schedule.DAILY)
    schedule('documents.tasks.consume_mail', name="Check E-Mail", schedule_type=Schedule.MINUTES, minutes=10)


def remove_schedules(apps, schema_editor):
    Schedule.objects.all().delete()
    Schedule.objects.filter(func='documents.tasks.train_classifier').delete()
    Schedule.objects.filter(func='documents.tasks.index_optimize').delete()


class Migration(migrations.Migration):
src/documents/migrations/1002_auto_20201111_1105.py (new file, 18 lines)
@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2020-11-11 11:05

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1001_auto_20201109_1636'),
    ]

    operations = [
        migrations.AlterField(
            model_name='document',
            name='filename',
            field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True),
        ),
    ]
@@ -3,18 +3,15 @@
import logging
import os
import re
from collections import OrderedDict, defaultdict
from collections import OrderedDict

import dateutil.parser
from django.conf import settings
from django.db import models
from django.dispatch import receiver
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.utils.text import slugify


class MatchingModel(models.Model):

    MATCH_ANY = 1
@@ -116,6 +113,7 @@ class DocumentType(MatchingModel):

class Document(models.Model):

    # TODO: why do we need an explicit list
    TYPE_PDF = "pdf"
    TYPE_PNG = "png"
    TYPE_JPG = "jpg"
@@ -192,7 +190,7 @@ class Document(models.Model):
        default=timezone.now, editable=False, db_index=True)

    filename = models.FilePathField(
        max_length=256,
        max_length=1024,
        editable=False,
        default=None,
        null=True,
@@ -220,123 +218,18 @@ class Document(models.Model):
            return "{}: {}".format(created, self.correspondent or self.title)
        return str(created)
    def find_renamed_document(self, subdirectory=""):
        suffix = "%07i.%s" % (self.pk, self.file_type)

        # Append .gpg for encrypted files
        if self.storage_type == self.STORAGE_TYPE_GPG:
            suffix += ".gpg"

        # Walk the directory hierarchy, looking for a file with this suffix
        root = os.path.normpath(Document.filename_to_path(subdirectory))

        for filename in os.listdir(root):
            if filename.endswith(suffix):
                return os.path.join(subdirectory, filename)

            fullname = os.path.join(subdirectory, filename)
            if os.path.isdir(Document.filename_to_path(fullname)):
                return self.find_renamed_document(fullname)

        return None

    @property
    def source_filename(self):
        # Initial filename generation (for new documents)
        if self.filename is None:
            self.filename = self.generate_source_filename()

        # Check if document is still available under filename
        elif not os.path.isfile(Document.filename_to_path(self.filename)):
            recovered_filename = self.find_renamed_document()

            # If we found the file, update the filename
            if recovered_filename is not None:
                logger = logging.getLogger(__name__)
                logger.warning("Filename of document " + str(self.id) +
                               " has changed and was successfully updated")
                self.filename = recovered_filename

                # Remove all empty subdirectories from MEDIA_ROOT
                Document.delete_all_empty_subdirectories(
                    Document.filename_to_path(""))
            else:
                logger = logging.getLogger(__name__)
                logger.error("File of document " + str(self.id) + " has " +
                             "gone and could not be recovered")

        return self.filename
    @staticmethod
    def many_to_dictionary(field):
        # Converts a ManyToManyField to a dictionary by assuming that field
        # entries contain an _ or - which will be used as a delimiter
        mydictionary = dict()

        for index, t in enumerate(field.all()):
            # Populate tag names by index
            mydictionary[index] = slugify(t.name)

            # Find delimiter
            delimiter = t.name.find('_')

            if delimiter == -1:
                delimiter = t.name.find('-')

            if delimiter == -1:
                continue

            key = t.name[:delimiter]
            value = t.name[delimiter+1:]

            mydictionary[slugify(key)] = slugify(value)

        return mydictionary

    def generate_source_filename(self):
        # Create filename based on configured format
        if settings.PAPERLESS_FILENAME_FORMAT is not None:
            tags = defaultdict(lambda: slugify(None),
                               self.many_to_dictionary(self.tags))
            path = settings.PAPERLESS_FILENAME_FORMAT.format(
                correspondent=slugify(self.correspondent),
                title=slugify(self.title),
                created=slugify(self.created),
                added=slugify(self.added),
                tags=tags)
        else:
            path = ""

        # Always append the primary key to guarantee uniqueness of filename
        if len(path) > 0:
            filename = "%s-%07i.%s" % (path, self.pk, self.file_type)
        else:
            filename = "%07i.%s" % (self.pk, self.file_type)

        # Append .gpg for encrypted files
        if self.storage_type == self.STORAGE_TYPE_GPG:
            filename += ".gpg"

        return filename
    def create_source_directory(self):
        new_filename = self.generate_source_filename()

        # Determine the full "target" path
        dir_new = Document.filename_to_path(os.path.dirname(new_filename))

        # Create new path
        os.makedirs(dir_new, exist_ok=True)

    @property
    def source_path(self):
        return Document.filename_to_path(self.source_filename)
        if self.filename:
            fname = str(self.filename)
        else:
            fname = "{:07}.{}".format(self.pk, self.file_type)
            if self.storage_type == self.STORAGE_TYPE_GPG:
                fname += ".gpg"

    @staticmethod
    def filename_to_path(filename):
        return os.path.join(
            settings.ORIGINALS_DIR,
            filename
            fname
        )

    @property
@@ -362,125 +255,6 @@ class Document(models.Model):
    def thumbnail_file(self):
        return open(self.thumbnail_path, "rb")

    def set_filename(self, filename):
        if os.path.isfile(Document.filename_to_path(filename)):
            self.filename = filename

    @staticmethod
    def try_delete_empty_directories(directory):
        # Go up in the directory hierarchy and try to delete all directories
        directory = os.path.normpath(directory)
        root = os.path.normpath(Document.filename_to_path(""))

        while directory != root:
            # Try to delete the current directory
            try:
                os.rmdir(directory)
            except os.error:
                # Directory not empty, no need to go further up
                return

            # Cut off actual directory and go one level up
            directory, _ = os.path.split(directory)
            directory = os.path.normpath(directory)

    @staticmethod
    def delete_all_empty_subdirectories(directory):
        # Go through all folders and try to delete all directories
        root = os.path.normpath(Document.filename_to_path(directory))

        for filename in os.listdir(root):
            fullname = os.path.join(directory, filename)

            if not os.path.isdir(Document.filename_to_path(fullname)):
                continue

            # Go into the subdirectory to see if there is more to delete
            Document.delete_all_empty_subdirectories(
                os.path.join(directory, filename))

            # Try to delete the directory
            try:
                os.rmdir(Document.filename_to_path(fullname))
                continue
            except os.error:
                # Directory not empty, no need to go further up
                continue


@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
def update_filename(sender, instance, **kwargs):
    # Skip if document has not been saved yet
    if instance.filename is None:
        return

    # Check if the file exists and update the filename otherwise
    if not os.path.isfile(Document.filename_to_path(instance.filename)):
        instance.filename = instance.source_filename

    # Build the new filename
    new_filename = instance.generate_source_filename()

    # If the filename is the same, then nothing needs to be done
    if instance.filename == new_filename:
        return

    # Determine the full "target" path
    path_new = instance.filename_to_path(new_filename)
    dir_new = instance.filename_to_path(os.path.dirname(new_filename))

    # Create new path
    instance.create_source_directory()

    # Determine the full "current" path
    path_current = instance.filename_to_path(instance.source_filename)

    # Move file
    try:
        os.rename(path_current, path_new)
    except PermissionError:
        # Do not update filename in object
        return
    except FileNotFoundError:
        logger = logging.getLogger(__name__)
        logger.error("Renaming of document " + str(instance.id) + " failed " +
                     "as file " + instance.filename + " was no longer present")
        return

    # Delete empty directory
    old_dir = os.path.dirname(instance.filename)
    old_path = instance.filename_to_path(old_dir)
    Document.try_delete_empty_directories(old_path)

    instance.filename = new_filename

    # Save instance
    # This will not cause a cascade of post_save signals, as next time
    # nothing needs to be renamed
    instance.save()


@receiver(models.signals.post_delete, sender=Document)
def delete_files(sender, instance, **kwargs):
    if instance.filename is None:
        return

    # Remove the document
    old_file = instance.filename_to_path(instance.filename)

    try:
        os.remove(old_file)
    except FileNotFoundError:
        logger = logging.getLogger(__name__)
        logger.warning("Deleted document " + str(instance.id) + " but file " +
                       old_file + " was no longer present")

    # And remove the directory (if applicable)
    old_dir = os.path.dirname(instance.filename)
    old_path = instance.filename_to_path(old_dir)
    Document.try_delete_empty_directories(old_path)


class Log(models.Model):

@@ -518,7 +292,7 @@ class FileInfo:
        non_separated_word=r"([\w,. ]|([^\s]-))"
    )
    )

    # TODO: what is this used for
    formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
@@ -20,13 +20,16 @@ from django.utils import timezone
#   - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
#   - MONTH ZZZZ, with ZZZZ being 4 digits
#   - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration

# TODO: isn't there a date parsing library for this?

DATE_REGEX = re.compile(
    r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
    r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
    r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +  # NOQA: E501
    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
    r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|'  # NOQA: E501
    r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|'  # NOQA: E501
    r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|'  # NOQA: E501
    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|'
    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)
||||
|
||||
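# (Editor's note: the alternates above cover, in order, the formats listed in
# the comments at the top of this hunk: DD.MM.YYYY, YYYY.MM.DD,
# "DD. MONTH YYYY", "MONTH DD, YYYY" and "MONTH YYYY", with two- or
# four-digit years where noted.)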
@@ -39,17 +42,16 @@ def get_parser_class(doc):
    Determine the appropriate parser class based on the file
    """

    parsers = []
    for response in document_consumer_declaration.send(None):
        parsers.append(response[1])

    # TODO: add a check that checks parser availability.

    options = []
    for parser in parsers:
        result = parser(doc)
        if result:
            options.append(result)

    # His last command was: COME! And they came. All of them. Even the parsers.

    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
        parser_test = parser_declaration["test"]

        if parser_test(doc):
            options.append(parser_declaration)

    if not options:
        return None
@@ -59,7 +61,7 @@ def get_parser_class(doc):
        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
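    # (Editor's note: the matching declarations are sorted by their "weight"
    # key in descending order, so the heaviest parser that passed its "test"
    # wins.)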


def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -74,7 +76,7 @@ def run_convert(input, output, density=None, scale=None, alpha=None, strip=False
    args += ['-trim'] if trim else []
    args += ['-type', str(type)] if type else []
    args += ['-depth', str(depth)] if depth else []
    args += [input, output]
    args += [input_file, output_file]

    logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})

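# (Editor's note: besides being clearer, the rename to input_file/output_file
# stops the first parameter from shadowing Python's built-in input().)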
@@ -100,17 +102,17 @@ class ParseError(Exception):
    pass


class DocumentParser:
class DocumentParser(LoggingMixin):
    """
    Subclass this to make your own parser. Have a look at
    `paperless_tesseract.parsers` for inspiration.
    """

    def __init__(self, path, logging_group, progress_callback):
        super().__init__()
        self.logging_group = logging_group
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
        self.logger = logging.getLogger(__name__)
        self.logging_group = logging_group
        self.progress_callback = progress_callback

    def get_thumbnail(self):
@@ -121,16 +123,19 @@ class DocumentParser:

    def optimise_thumbnail(self, in_path):

        out_path = os.path.join(self.tempdir, "optipng.png")
        if settings.OPTIMIZE_THUMBNAILS:
            out_path = os.path.join(self.tempdir, "optipng.png")

        args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
            args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)

        self.log('debug', 'Execute: ' + " ".join(args))
            self.log('debug', 'Execute: ' + " ".join(args))

        if not subprocess.Popen(args).wait() == 0:
            raise ParseError("Optipng failed at {}".format(args))
            if not subprocess.Popen(args).wait() == 0:
                raise ParseError("Optipng failed at {}".format(args))

        return out_path
            return out_path
        else:
            return in_path

    def get_optimised_thumbnail(self):
        return self.optimise_thumbnail(self.get_thumbnail())
@@ -222,11 +227,6 @@ class DocumentParser:

        return date

    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
            "group": self.logging_group
        })

    def cleanup(self):
        self.log("debug", "Deleting directory {}".format(self.tempdir))
        shutil.rmtree(self.tempdir)
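# (Editor's note: the log() helper removed above is now inherited from
# LoggingMixin, which is why DocumentParser gains that base class and drops
# its own logger attribute in __init__.)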
@@ -105,7 +105,6 @@ class DocumentSerializer(serializers.ModelSerializer):

class LogSerializer(serializers.ModelSerializer):


    class Meta:
        model = Log
        fields = (
@@ -1,5 +1,5 @@
from django.dispatch import Signal

document_consumption_started = Signal(providing_args=["filename"])
document_consumption_finished = Signal(providing_args=["document"])
document_consumer_declaration = Signal(providing_args=[])
document_consumption_started = Signal()
document_consumption_finished = Signal()
document_consumer_declaration = Signal()
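# (Editor's note: providing_args was deprecated in Django 3.0 and removed in
# Django 4.0; the bare Signal() spelling is the forward-compatible form.)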
@@ -6,9 +6,13 @@ from django.conf import settings
from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.db import models, DatabaseError
from django.dispatch import receiver
from django.utils import timezone

from .. import index, matching
from ..file_handling import delete_empty_directories, generate_filename, \
    create_source_path_directory
from ..models import Document, Tag


@@ -141,17 +145,65 @@ def run_post_consume_script(sender, document, **kwargs):
    )).wait()


@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):

    if not isinstance(instance, Document):
        return

    for f in (instance.source_path, instance.thumbnail_path):
        try:
            os.unlink(f)
        except FileNotFoundError:
            pass  # The file's already gone, so we're cool with it.

    delete_empty_directories(os.path.dirname(instance.source_path))


@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
def update_filename_and_move_files(sender, instance, **kwargs):

    if not instance.filename:
        # Can't update the filename if there is no filename to begin with.
        # This happens after the consumer creates a new document.
        # The PK needs to be set first by saving the document once. When this
        # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be
        # renamed anyway. In all other cases, instance.filename will be set.
        return

    old_filename = instance.filename
    old_path = instance.source_path
    new_filename = generate_filename(instance)

    if new_filename == instance.filename:
        # Don't do anything if it's the same.
        return

    new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)

    if not os.path.isfile(old_path):
        # Can't do anything if the old file does not exist anymore.
        logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
        return

    if os.path.isfile(new_path):
        # Can't do anything if the new file already exists. Skip updating file.
        logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
        return

    create_source_path_directory(new_path)

    try:
        os.rename(old_path, new_path)
        instance.filename = new_filename
        instance.save()

    except OSError as e:
        instance.filename = old_filename
    except DatabaseError as e:
        os.rename(new_path, old_path)
        instance.filename = old_filename
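    # (Editor's note: the two except branches above amount to a manual
    # rollback: if the rename fails, only the in-memory filename is reverted;
    # if the database save fails, the file is moved back to its old path so
    # disk and database stay consistent.)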

    if not os.path.isfile(old_path):
        delete_empty_directories(os.path.dirname(old_path))


def set_log_entry(sender, document=None, logging_group=None, **kwargs):

@@ -1,20 +1,15 @@
import logging

from django.conf import settings
from django_q.tasks import async_task, result
from whoosh.writing import AsyncWriter

from documents import index
from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
from documents.mail import MailFetcher
from documents.consumer import Consumer, ConsumerError
from documents.models import Document


def consume_mail():
    MailFetcher().pull()


def index_optimize():
    index.open_index().optimize()

@@ -55,3 +50,27 @@ def train_classifier():
        logging.getLogger(__name__).error(
            "Classifier error: " + str(e)
        )


def consume_file(path,
                 override_filename=None,
                 override_title=None,
                 override_correspondent_id=None,
                 override_document_type_id=None,
                 override_tag_ids=None):

    document = Consumer().try_consume_file(
        path,
        override_filename=override_filename,
        override_title=override_title,
        override_correspondent_id=override_correspondent_id,
        override_document_type_id=override_document_type_id,
        override_tag_ids=override_tag_ids)

    if document:
        return "Success. New document id {} created".format(
            document.pk
        )
    else:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")
File diff suppressed because it is too large
@@ -1,208 +0,0 @@
Return-Path: <sender@example.com>
X-Original-To: sender@mailbox4.mailhost.com
Delivered-To: sender@mailbox4.mailhost.com
Received: from mx8.mailhost.com (mail8.mailhost.com [75.126.24.68])
    by mailbox4.mailhost.com (Postfix) with ESMTP id B62BD5498001
    for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from localhost (localhost.localdomain [127.0.0.1])
    by mx8.mailhost.com (Postfix) with ESMTP id B41796F190D
    for <sender@mailbox4.mailhost.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
X-Spam-Flag: NO
X-Spam-Score: 0
X-Spam-Level:
X-Spam-Status: No, score=0 tagged_above=-999 required=3
    tests=[RCVD_IN_DNSWL_NONE=-0.0001]
Received: from mx8.mailhost.com ([127.0.0.1])
    by localhost (mail8.mailhost.com [127.0.0.1]) (amavisd-new, port 10024)
    with ESMTP id 3cj6d28FXsS3 for <sender@mailbox4.mailhost.com>;
    Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from smtp.mailhost.com (smtp.mailhost.com [74.55.86.74])
    by mx8.mailhost.com (Postfix) with ESMTP id 527D76F1529
    for <paperless@example.com>; Thu, 4 Feb 2016 22:01:17 +0000 (UTC)
Received: from [10.114.0.19] (nl3x.mullvad.net [46.166.136.162])
    by smtp.mailhost.com (Postfix) with ESMTP id 9C52420C6FDA
    for <paperless@example.com>; Thu, 4 Feb 2016 22:01:16 +0000 (UTC)
To: paperless@example.com
From: Daniel Quinn <sender@example.com>
Subject: Test 0
Message-ID: <56B3CA2A.6030806@example.com>
Date: Thu, 4 Feb 2016 22:01:14 +0000
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101
    Thunderbird/38.5.0
MIME-Version: 1.0
Content-Type: multipart/mixed;
    boundary="------------090701020702030809070008"

This is a multi-part message in MIME format.
--------------090701020702030809070008
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit

The secret word is "paperless" :-)

--------------090701020702030809070008
Content-Type: application/pdf;
    name="test0.pdf"
Content-Transfer-Encoding: base64
Content-Disposition: attachment;
    filename="test0.pdf"

[158 lines of base64-encoded PDF data for the "test0.pdf" attachment omitted]
--------------090701020702030809070008--
src/documents/tests/test_api.py (new file, 217 lines)
@@ -0,0 +1,217 @@
import os
import shutil
import tempfile
from unittest import mock

from django.contrib.auth.models import User
from django.test import override_settings
from rest_framework.test import APITestCase

from documents.models import Document, Correspondent, DocumentType, Tag


class DocumentApiTest(APITestCase):

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
        self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")

        os.makedirs(self.originals_dir, exist_ok=True)
        os.makedirs(self.thumbnail_dir, exist_ok=True)

        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumbnail_dir
        ).enable()

        user = User.objects.create_superuser(username="temp_admin")
        self.client.force_login(user=user)

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)

    def testDocuments(self):

        response = self.client.get("/api/documents/").data

        self.assertEqual(response['count'], 0)

        c = Correspondent.objects.create(name="c", pk=41)
        dt = DocumentType.objects.create(name="dt", pk=63)
        tag = Tag.objects.create(name="t", pk=85)

        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")

        doc.tags.add(tag)

        response = self.client.get("/api/documents/", format='json')
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.data['count'], 1)

        returned_doc = response.data['results'][0]
        self.assertEqual(returned_doc['id'], doc.id)
        self.assertEqual(returned_doc['title'], doc.title)
        self.assertEqual(returned_doc['correspondent']['name'], c.name)
        self.assertEqual(returned_doc['document_type']['name'], dt.name)
        self.assertEqual(returned_doc['correspondent']['id'], c.id)
        self.assertEqual(returned_doc['document_type']['id'], dt.id)
        self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id'])
        self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id'])
        self.assertEqual(len(returned_doc['tags']), 1)
        self.assertEqual(returned_doc['tags'][0]['name'], tag.name)
        self.assertEqual(returned_doc['tags'][0]['id'], tag.id)
        self.assertListEqual(returned_doc['tags_id'], [tag.id])

        c2 = Correspondent.objects.create(name="c2")

        returned_doc['correspondent_id'] = c2.pk
        returned_doc['title'] = "the new title"

        response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json')

        self.assertEqual(response.status_code, 200)

        doc_after_save = Document.objects.get(id=doc.id)

        self.assertEqual(doc_after_save.correspondent, c2)
        self.assertEqual(doc_after_save.title, "the new title")

        self.client.delete("/api/documents/{}/".format(doc_after_save.pk))

        self.assertEqual(len(Document.objects.all()), 0)

    def test_document_actions(self):

        _, filename = tempfile.mkstemp(dir=self.originals_dir)

        content = b"This is a test"
        content_thumbnail = b"thumbnail content"

        with open(filename, "wb") as f:
            f.write(content)

        doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")

        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
            f.write(content_thumbnail)

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content)

        response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content)

        response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_thumbnail)

    def test_document_actions_not_existing_file(self):

        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
        self.assertEqual(response.status_code, 404)

        response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
        self.assertEqual(response.status_code, 404)

        response = self.client.get('/api/documents/{}/thumb/'.format(doc.pk))
        self.assertEqual(response.status_code, 404)

    def test_document_filters(self):

        doc1 = Document.objects.create(title="none1", checksum="A")
        doc2 = Document.objects.create(title="none2", checksum="B")
        doc3 = Document.objects.create(title="none3", checksum="C")

        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
        tag_2 = Tag.objects.create(name="t2")
        tag_3 = Tag.objects.create(name="t3")

        doc1.tags.add(tag_inbox)
        doc2.tags.add(tag_2)
        doc3.tags.add(tag_2)
        doc3.tags.add(tag_3)

        response = self.client.get("/api/documents/?is_in_inbox=true")
        self.assertEqual(response.status_code, 200)
        results = response.data['results']
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['id'], doc1.id)

        response = self.client.get("/api/documents/?is_in_inbox=false")
        self.assertEqual(response.status_code, 200)
        results = response.data['results']
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['id'], doc2.id)
        self.assertEqual(results[1]['id'], doc3.id)

        response = self.client.get("/api/documents/?tags__id__in={},{}".format(tag_inbox.id, tag_3.id))
        self.assertEqual(response.status_code, 200)
        results = response.data['results']
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['id'], doc1.id)
        self.assertEqual(results[1]['id'], doc3.id)

        response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_2.id, tag_3.id))
        self.assertEqual(response.status_code, 200)
        results = response.data['results']
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['id'], doc3.id)

        response = self.client.get("/api/documents/?tags__id__all={},{}".format(tag_inbox.id, tag_3.id))
        self.assertEqual(response.status_code, 200)
        results = response.data['results']
        self.assertEqual(len(results), 0)

        response = self.client.get("/api/documents/?tags__id__all={}a{}".format(tag_inbox.id, tag_3.id))
        self.assertEqual(response.status_code, 200)
        results = response.data['results']
        self.assertEqual(len(results), 3)
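        # (Editor's note: a malformed id list -- "a" instead of "," as the
        # separator -- is ignored by the filter, so all three documents come
        # back.)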

    @mock.patch("documents.index.autocomplete")
    def test_search_autocomplete(self, m):
        m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
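        # (Editor's note: the mock returns `limit` copies of the term, so the
        # assertions below really test the default and custom limit handling
        # of the endpoint, not the index itself.)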

        response = self.client.get("/api/search/autocomplete/?term=test")
        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(response.data), 10)

        response = self.client.get("/api/search/autocomplete/?term=test&limit=20")
        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(response.data), 20)

        response = self.client.get("/api/search/autocomplete/?term=test&limit=-1")
        self.assertEqual(response.status_code, 400)

        response = self.client.get("/api/search/autocomplete/")
        self.assertEqual(response.status_code, 400)

        response = self.client.get("/api/search/autocomplete/?term=")
        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(response.data), 10)

    def test_statistics(self):

        doc1 = Document.objects.create(title="none1", checksum="A")
        doc2 = Document.objects.create(title="none2", checksum="B")
        doc3 = Document.objects.create(title="none3", checksum="C")

        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)

        doc1.tags.add(tag_inbox)

        response = self.client.get("/api/statistics/")
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.data['documents_total'], 3)
        self.assertEqual(response.data['documents_inbox'], 1)
@@ -2,9 +2,9 @@ import unittest

from django.test import TestCase

from .factories import DocumentFactory
from ..checks import changed_password_check
from ..models import Document
from .factories import DocumentFactory


class ChecksTestCase(TestCase):
src/documents/tests/test_classifier.py (new file, 85 lines)
@@ -0,0 +1,85 @@
import tempfile

from django.test import TestCase, override_settings

from documents.classifier import DocumentClassifier
from documents.models import Correspondent, Document, Tag, DocumentType


class TestClassifier(TestCase):

    def setUp(self):

        self.classifier = DocumentClassifier()

    def generate_test_data(self):
        self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        self.c2 = Correspondent.objects.create(name="c2")
        self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
        self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
        self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
        self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")
        self.doc_inbox = Document.objects.create(title="doc235", content="aa", checksum="C")

        self.doc1.tags.add(self.t1)
        self.doc2.tags.add(self.t1)
        self.doc2.tags.add(self.t3)
        self.doc_inbox.tags.add(self.t2)

    def testNoTrainingData(self):
        try:
            self.classifier.train()
        except ValueError as e:
            self.assertEqual(str(e), "No training data available.")
        else:
            self.fail("Should raise exception")

    def testEmpty(self):
        Document.objects.create(title="WOW", checksum="3457", content="ASD")
        self.classifier.train()
        self.assertIsNone(self.classifier.document_type_classifier)
        self.assertIsNone(self.classifier.tags_classifier)
        self.assertIsNone(self.classifier.correspondent_classifier)

        self.assertListEqual(self.classifier.predict_tags(""), [])
        self.assertIsNone(self.classifier.predict_document_type(""))
        self.assertIsNone(self.classifier.predict_correspondent(""))

    def testTrain(self):
        self.generate_test_data()
        self.classifier.train()
        self.assertListEqual(list(self.classifier.correspondent_classifier.classes_), [-1, self.c1.pk])
        self.assertListEqual(list(self.classifier.tags_binarizer.classes_), [self.t1.pk, self.t3.pk])

    def testPredict(self):
        self.generate_test_data()
        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
        self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
        self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,))
        self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk))
        self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
        self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)

    def testDatasetHashing(self):

        self.generate_test_data()

        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())
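        # (Editor's note: train() reports whether anything changed; the second
        # call sees an unchanged data set hash and returns False.)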

    @override_settings(DATA_DIR=tempfile.mkdtemp())
    def testSaveClassifier(self):

        self.generate_test_data()

        self.classifier.train()

        self.classifier.save_classifier()

        new_classifier = DocumentClassifier()
        new_classifier.reload()
        self.assertFalse(new_classifier.train())
@@ -1,8 +1,15 @@
import os
import re
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock

from django.test import TestCase
from django.test import TestCase, override_settings

from ..models import FileInfo, Tag
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError


class TestAttributes(TestCase):
@@ -394,3 +401,254 @@ class TestFieldPermutations(TestCase):
        self.assertEqual(info.created.year, 2019)
        self.assertEqual(info.created.month, 9)
        self.assertEqual(info.created.day, 8)


class DummyParser(DocumentParser):

    def get_thumbnail(self):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, path, logging_group, scratch_dir):
        super(DummyParser, self).__init__(path, logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)

    def get_optimised_thumbnail(self):
        return self.fake_thumb

    def get_text(self):
        return "The Text"


class FaultyParser(DocumentParser):

    def get_thumbnail(self):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, path, logging_group, scratch_dir):
        super(FaultyParser, self).__init__(path, logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)

    def get_optimised_thumbnail(self):
        return self.fake_thumb

    def get_text(self):
        raise ParseError("Does not compute.")
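# (Editor's note: FaultyParser stands in for a parser whose text extraction
# always fails, so the consumer's error handling can be exercised below.)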


class TestConsumer(TestCase):

    def make_dummy_parser(self, path, logging_group):
        return DummyParser(path, logging_group, self.scratch_dir)

    def make_faulty_parser(self, path, logging_group):
        return FaultyParser(path, logging_group, self.scratch_dir)

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.consumption_dir = tempfile.mkdtemp()

        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
            THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
            CONSUMPTION_DIR=self.consumption_dir
        ).enable()

        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
        m = patcher.start()
        m.return_value = [(None, {
            "parser": self.make_dummy_parser,
            "test": lambda _: True,
            "weight": 0
        })]

        self.addCleanup(patcher.stop)

        self.consumer = Consumer()

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
        shutil.rmtree(self.consumption_dir, ignore_errors=True)

    def get_test_file(self):
        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
        return f

    def testNormalOperation(self):

        filename = self.get_test_file()
        document = self.consumer.try_consume_file(filename)

        self.assertEqual(document.content, "The Text")
        self.assertEqual(document.title, os.path.splitext(os.path.basename(filename))[0])
        self.assertIsNone(document.correspondent)
        self.assertIsNone(document.document_type)
        self.assertEqual(document.filename, "0000001.pdf")

        self.assertTrue(os.path.isfile(
            document.source_path
        ))

        self.assertTrue(os.path.isfile(
            document.thumbnail_path
        ))

        self.assertFalse(os.path.isfile(filename))

    def testOverrideFilename(self):
        filename = self.get_test_file()
        override_filename = "My Bank - Statement for November.pdf"

        document = self.consumer.try_consume_file(filename, override_filename=override_filename)

        self.assertEqual(document.correspondent.name, "My Bank")
        self.assertEqual(document.title, "Statement for November")

    def testOverrideTitle(self):

        document = self.consumer.try_consume_file(self.get_test_file(), override_title="Override Title")
        self.assertEqual(document.title, "Override Title")

    def testOverrideCorrespondent(self):
        c = Correspondent.objects.create(name="test")

        document = self.consumer.try_consume_file(self.get_test_file(), override_correspondent_id=c.pk)
        self.assertEqual(document.correspondent.id, c.id)

    def testOverrideDocumentType(self):
        dt = DocumentType.objects.create(name="test")

        document = self.consumer.try_consume_file(self.get_test_file(), override_document_type_id=dt.pk)
        self.assertEqual(document.document_type.id, dt.id)

    def testOverrideTags(self):
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")
        t3 = Tag.objects.create(name="t3")
        document = self.consumer.try_consume_file(self.get_test_file(), override_tag_ids=[t1.id, t3.id])

        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
        self.assertIn(t3, document.tags.all())

    def testNotAFile(self):
        try:
            self.consumer.try_consume_file("non-existing-file")
        except ConsumerError as e:
            self.assertTrue(str(e).endswith('It is not a file'))
            return

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR=None)
    def testConsumptionDirUnset(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.")
            return

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR="asd")
    def testNoConsumptionDir(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Consumption directory asd does not exist")
            return

        self.fail("Should throw exception")

    def testDuplicates(self):
        self.consumer.try_consume_file(self.get_test_file())

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertTrue(str(e).endswith("It is a duplicate."))
            return

        self.fail("Should throw exception")

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testNoParsers(self, m):
        m.return_value = []

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertTrue(str(e).startswith("No parsers abvailable"))
            return

        self.fail("Should throw exception")

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testFaultyParser(self, m):
        m.return_value = [(None, {
            "parser": self.make_faulty_parser,
            "test": lambda _: True,
            "weight": 0
        })]

        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Does not compute.")
            return

        self.fail("Should throw exception.")

    @mock.patch("documents.consumer.Consumer._write")
    def testPostSaveError(self, m):
        filename = self.get_test_file()
        m.side_effect = OSError("NO.")
        try:
            self.consumer.try_consume_file(filename)
        except ConsumerError as e:
            self.assertEqual(str(e), "NO.")
        else:
            self.fail("Should raise exception")

        # file not deleted
        self.assertTrue(os.path.isfile(filename))

        # Database empty
        self.assertEqual(len(Document.objects.all()), 0)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def testFilenameHandling(self):
        filename = self.get_test_file()

        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")

        print(document.source_path)
        print("===")

        self.assertEqual(document.title, "new docs")
        self.assertEqual(document.correspondent.name, "Bank")
        self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")

    @mock.patch("documents.consumer.DocumentClassifier")
    def testClassifyDocument(self, m):
        correspondent = Correspondent.objects.create(name="test")
        dtype = DocumentType.objects.create(name="test")
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")

        m.return_value = MagicMock()
        m.return_value.predict_correspondent.return_value = correspondent.pk
        m.return_value.predict_document_type.return_value = dtype.pk
        m.return_value.predict_tags.return_value = [t1.pk]

        document = self.consumer.try_consume_file(self.get_test_file())

        self.assertEqual(document.correspondent, correspondent)
        self.assertEqual(document.document_type, dtype)
        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
@@ -1,17 +1,14 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4
from pathlib import Path
from shutil import rmtree
from uuid import uuid4

from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings

from django.utils.text import slugify
from ..models import Tag, Document, Correspondent
from django.conf import settings
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
from ..models import Document, Correspondent
from ..signals.handlers import update_filename_and_move_files


class TestDate(TestCase):
@@ -31,18 +28,6 @@ class TestDate(TestCase):
        for dirname in self.deletion_list:
            shutil.rmtree(dirname, ignore_errors=True)

    @override_settings(PAPERLESS_FILENAME_FORMAT="")
    def test_source_filename(self):
        document = Document()
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        self.assertEqual(document.source_filename, "0000001.pdf")

        document.filename = "test.pdf"
        self.assertEqual(document.source_filename, "test.pdf")

    @override_settings(PAPERLESS_FILENAME_FORMAT="")
    def test_generate_source_filename(self):
        document = Document()
@@ -50,58 +35,50 @@ class TestDate(TestCase):
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        self.assertEqual(document.generate_source_filename(), "0000001.pdf")
        self.assertEqual(generate_filename(document), "{:07d}.pdf".format(document.pk))

        document.storage_type = Document.STORAGE_TYPE_GPG
        self.assertEqual(document.generate_source_filename(),
                         "0000001.pdf.gpg")
        self.assertEqual(generate_filename(document),
                         "{:07d}.pdf.gpg".format(document.pk))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
                       "{correspondent}")
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming(self):
        document = Document()
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none/none-0000001.pdf")
        document.create_source_directory()
        Path(document.source_path).touch()
        # Test default source_path
        self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/{:07d}.pdf".format(document.pk))

        # Test source_path
        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
                         "/documents/originals/none/none-0000001.pdf")
        document.filename = generate_filename(document)

        # Ensure that filename is properly generated
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

        # Enable encryption and check again
        document.storage_type = Document.STORAGE_TYPE_GPG
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none/none-0000001.pdf.gpg")
        document.filename = generate_filename(document)
        self.assertEqual(document.filename,
                         "none/none-{:07d}.pdf.gpg".format(document.pk))

        document.save()

        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                         "/documents/originals/none"), True)
        # test that creating dirs for the source_path creates the correct directory
        create_source_path_directory(document.source_path)
        Path(document.source_path).touch()
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)

        # Set a correspondent and save the document
        document.correspondent = Correspondent.objects.get_or_create(
            name="test")[0]
        document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
        document.save()

        # Check proper handling of files
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                         "/documents/originals/test"), True)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                         "/documents/originals/none"), False)
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
                         "originals/test/test-0000001.pdf.gpg"), True)
        self.assertEqual(document.generate_source_filename(),
                         "test/test-0000001.pdf.gpg")
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/test/test-{:07d}.pdf.gpg".format(document.pk)), True)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
                       "{correspondent}")
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming_missing_permissions(self):
        document = Document()
        document.file_type = "pdf"
@@ -109,34 +86,67 @@ class TestDate(TestCase):
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none/none-0000001.pdf")
        document.create_source_directory()
        document.filename = generate_filename(document)
        self.assertEqual(document.filename,
                         "none/none-{:07d}.pdf".format(document.pk))
        create_source_path_directory(document.source_path)
        Path(document.source_path).touch()

        # Test source_path
        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
                         "/documents/originals/none/none-0000001.pdf")
        self.assertEqual(document.source_path, settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk))

        # Make the folder read- and execute-only (no writing and no renaming)
        os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o555)
        os.chmod(settings.ORIGINALS_DIR + "/none", 0o555)

        # Set a correspondent and save the document
        document.correspondent = Correspondent.objects.get_or_create(
            name="test")[0]
        document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
        document.save()

        # Check proper handling of files
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
                         "originals/none/none-0000001.pdf"), True)
        self.assertEqual(document.source_filename,
                         "none/none-0000001.pdf")
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

        os.chmod(settings.MEDIA_ROOT + "/documents/originals/none", 0o777)
        os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
                       "{correspondent}")
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_file_renaming_database_error(self):

        document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")

        document = Document()
        document.file_type = "pdf"
        document.checksum = "BBBBB"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        # Ensure that filename is properly generated
        document.filename = generate_filename(document)
        self.assertEqual(document.filename,
                         "none/none-{:07d}.pdf".format(document.pk))
        create_source_path_directory(document.source_path)
        Path(document.source_path).touch()

        # Test source_path
        self.assertTrue(os.path.isfile(document.source_path))

        # Set a correspondent and save the document
        document.correspondent = Correspondent.objects.get_or_create(
            name="test")[0]

        # This will cause save() to fail.
        document.checksum = document1.checksum

        # Assume saving the document initially works, this gets called.
        # After renaming, an error occurs, and filename is not saved:
        # document should still be available at document.filename.
        update_filename_and_move_files(None, document)

        # Check proper handling of files
        self.assertTrue(os.path.isfile(document.source_path))
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_document_delete(self):
        document = Document()
        document.file_type = "pdf"
@@ -144,21 +154,20 @@ class TestDate(TestCase):
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename,
|
||||
"none/none-{:07d}.pdf".format(document.pk))
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
Path(document.source_path).touch()
|
||||
|
||||
# Ensure file deletion after delete
|
||||
pk = document.pk
|
||||
document.delete()
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdf"), False)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), False)
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(pk)), False)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_document_delete_nofile(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
@@ -167,8 +176,7 @@ class TestDate(TestCase):
|
||||
|
||||
document.delete()
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
|
||||
"{correspondent}")
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
def test_directory_not_empty(self):
|
||||
document = Document()
|
||||
document.file_type = "pdf"
|
||||
@@ -176,28 +184,24 @@ class TestDate(TestCase):
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none/none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
document.filename = generate_filename(document)
|
||||
self.assertEqual(document.filename,
|
||||
"none/none-{:07d}.pdf".format(document.pk))
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
Path(document.source_path).touch()
|
||||
Path(document.source_path + "test").touch()
|
||||
important_file = document.source_path + "test"
|
||||
Path(important_file).touch()
|
||||
|
||||
# Set a correspondent and save the document
|
||||
document.correspondent = Correspondent.objects.get_or_create(
|
||||
name="test")[0]
|
||||
document.correspondent = Correspondent.objects.get_or_create(name="test")[0]
|
||||
document.save()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/test"), True)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none"), True)
|
||||
|
||||
# Cleanup
|
||||
os.remove(settings.MEDIA_ROOT +
|
||||
"/documents/originals/none/none-0000001.pdftest")
|
||||
os.rmdir(settings.MEDIA_ROOT + "/documents/originals/none")
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True)
|
||||
self.assertTrue(os.path.isfile(important_file))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_underscore(self):
|
||||
@@ -212,13 +216,8 @@ class TestDate(TestCase):
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"demo-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_with_dash(self):
|
||||
@@ -233,13 +232,8 @@ class TestDate(TestCase):
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"demo-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
def test_tags_malformed(self):
|
||||
@@ -254,13 +248,8 @@ class TestDate(TestCase):
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"none-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
|
||||
document.delete()
|
||||
self.assertEqual(generate_filename(document),
|
||||
"none-{:07d}.pdf".format(document.pk))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
|
||||
def test_tags_all(self):
|
||||
@@ -274,64 +263,25 @@ class TestDate(TestCase):
|
||||
document.save()
|
||||
|
||||
# Ensure that filename is properly generated
|
||||
tmp = document.source_filename
|
||||
self.assertEqual(document.generate_source_filename(),
|
||||
"demo-0000001.pdf")
|
||||
document.create_source_directory()
|
||||
Path(document.source_path).touch()
|
||||
self.assertEqual(generate_filename(document),
|
||||
"demo-{:07d}.pdf".format(document.pk))
|
||||
|
||||
document.delete()

    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
    def test_tags_out_of_bounds_0(self):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
    def test_tags_out_of_bounds(self):
        document = Document()
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none-0000001.pdf")
        document.create_source_directory()
        Path(document.source_path).touch()

        document.delete()

    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[10000000]}")
    def test_tags_out_of_bounds_10000000(self):
        document = Document()
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        # Add tag to document
        document.tags.create(name="demo")
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none-0000001.pdf")
        document.create_source_directory()
        Path(document.source_path).touch()
        self.assertEqual(generate_filename(document),
                         "none-{:07d}.pdf".format(document.pk))

        document.delete()

    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[99]}")
    def test_tags_out_of_bounds_99(self):
        document = Document()
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none-0000001.pdf")
        document.create_source_directory()
        Path(document.source_path).touch()

        document.delete()

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
                       "{correspondent}/{correspondent}")
    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
    def test_nested_directory_cleanup(self):
        document = Document()
        document.file_type = "pdf"
@@ -339,153 +289,34 @@ class TestDate(TestCase):
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none/none/none-0000001.pdf")
        document.create_source_directory()
        document.filename = generate_filename(document)
        self.assertEqual(document.filename, "none/none/none-{:07d}.pdf".format(document.pk))
        create_source_path_directory(document.source_path)
        Path(document.source_path).touch()

        # Check proper handling of files
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/none/none"), True)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), True)

        pk = document.pk
        document.delete()

        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT +
                                        "/documents/originals/none/none/none-0000001.pdf"),
                         False)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/none/none"), False)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/none"), False)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals"), True)
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none/none-{:07d}.pdf".format(pk)), False)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none/none"), False)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), False)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR), True)

    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
    def test_format_none(self):
        document = Document()
        document.pk = 1
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        self.assertEqual(document.generate_source_filename(), "0000001.pdf")

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
                       "{correspondent}")
    def test_document_renamed(self):
        document = Document()
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none/none-0000001.pdf")
        document.create_source_directory()
        Path(document.source_path).touch()

        # Test source_path
        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
                         "/documents/originals/none/none-0000001.pdf")

        # Rename the document "illegally"
        os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
        os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
                  "none/none-0000001.pdf",
                  settings.MEDIA_ROOT + "/documents/originals/" +
                  "test/test-0000001.pdf")
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
                                        "originals/test/test-0000001.pdf"), True)
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
                                        "originals/none/none-0000001.pdf"), False)

        # Set new correspondent and expect document to be saved properly
        document.correspondent = Correspondent.objects.get_or_create(
            name="foo")[0]
        document.save()
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
                                        "originals/foo/foo-0000001.pdf"), True)

        # Check proper handling of files
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/foo"), True)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/none"), False)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/test"), False)
        self.assertEqual(document.generate_source_filename(),
                         "foo/foo-0000001.pdf")

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
                       "{correspondent}")
    def test_document_renamed_encrypted(self):
        document = Document()
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_GPG
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none/none-0000001.pdf.gpg")
        document.create_source_directory()
        Path(document.source_path).touch()

        # Test source_path
        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
                         "/documents/originals/none/none-0000001.pdf.gpg")

        # Rename the document "illegally"
        os.makedirs(settings.MEDIA_ROOT + "/documents/originals/test")
        os.rename(settings.MEDIA_ROOT + "/documents/originals/" +
                  "none/none-0000001.pdf.gpg",
                  settings.MEDIA_ROOT + "/documents/originals/" +
                  "test/test-0000001.pdf.gpg")
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
                                        "originals/test/test-0000001.pdf.gpg"), True)
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
                                        "originals/none/none-0000001.pdf"), False)

        # Set new correspondent and expect document to be saved properly
        document.correspondent = Correspondent.objects.get_or_create(
            name="foo")[0]
        document.save()
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/" +
                                        "originals/foo/foo-0000001.pdf.gpg"), True)

        # Check proper handling of files
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/foo"), True)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/none"), False)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/test"), False)
        self.assertEqual(document.generate_source_filename(),
                         "foo/foo-0000001.pdf.gpg")

    def test_delete_all_empty_subdirectories(self):
        # Create our working directory
        tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
        os.makedirs(tmp)
        self.add_to_deletion_list(tmp)

        os.makedirs(os.path.join(tmp, "empty"))
        os.makedirs(os.path.join(tmp, "empty", "subdirectory"))

        os.makedirs(os.path.join(tmp, "notempty"))
        Path(os.path.join(tmp, "notempty", "file")).touch()

        Document.delete_all_empty_subdirectories(tmp)

        self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
        self.assertEqual(os.path.isdir(os.path.join(tmp, "empty")), False)
        self.assertEqual(os.path.isfile(
            os.path.join(tmp, "notempty", "file")), True)
        self.assertEqual(generate_filename(document), "0000001.pdf")

    def test_try_delete_empty_directories(self):
        # Create our working directory
        tmp = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
        tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty")
        os.makedirs(tmp)
        self.add_to_deletion_list(tmp)

@@ -493,67 +324,27 @@ class TestDate(TestCase):
        Path(os.path.join(tmp, "notempty", "file")).touch()
        os.makedirs(os.path.join(tmp, "notempty", "empty"))

        Document.try_delete_empty_directories(
            os.path.join(tmp, "notempty", "empty"))
        delete_empty_directories(os.path.join(tmp, "notempty", "empty"))
        self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
        self.assertEqual(os.path.isfile(
            os.path.join(tmp, "notempty", "file")), True)
        self.assertEqual(os.path.isdir(
            os.path.join(tmp, "notempty", "empty")), False)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
                       "{correspondent}")
    def test_document_accidentally_deleted(self):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{created/[title]")
    def test_invalid_format(self):
        document = Document()
        document.pk = 1
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none/none-0000001.pdf")
        document.create_source_directory()
        Path(document.source_path).touch()
        self.assertEqual(generate_filename(document), "0000001.pdf")

        # Test source_path
        self.assertEqual(document.source_path, settings.MEDIA_ROOT +
                         "/documents/originals/none/none-0000001.pdf")

        # Delete the document "illegally"
        os.remove(settings.MEDIA_ROOT + "/documents/originals/" +
                  "none/none-0000001.pdf")

        # Set new correspondent and expect document to be saved properly
        document.correspondent = Correspondent.objects.get_or_create(
            name="foo")[0]
        document.save()

        # Check proper handling of files
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT +
                                       "/documents/originals/none"), True)
        self.assertEqual(document.source_filename,
                         "none/none-0000001.pdf")

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/" +
                       "{correspondent}")
    def test_set_filename(self):
    @override_settings(PAPERLESS_FILENAME_FORMAT="{created__year}")
    def test_invalid_format_key(self):
        document = Document()
        document.pk = 1
        document.file_type = "pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        document.save()

        # Ensure that filename is properly generated
        tmp = document.source_filename
        self.assertEqual(document.generate_source_filename(),
                         "none/none-0000001.pdf")
        document.create_source_directory()
        Path(document.source_path).touch()

        # Set existing filename
        document.set_filename(tmp)
        self.assertEqual(document.source_filename, "none/none-0000001.pdf")

        # Set non-existing filename
        document.set_filename("doesnotexist")
        self.assertEqual(document.source_filename, "none/none-0000001.pdf")
        self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -1,9 +1,8 @@
from django.core.management.base import CommandError
from django.test import TestCase

from ..management.commands.document_importer import Command

from documents.settings import EXPORTER_FILE_NAME
from ..management.commands.document_importer import Command


class TestImporter(TestCase):
@@ -1,6 +1,5 @@
import logging
import uuid

from unittest import mock

from django.test import TestCase
@@ -1,91 +0,0 @@
import base64
import os
import magic

from hashlib import md5
from unittest import mock

from django.conf import settings
from django.test import TestCase

from ..mail import Message, Attachment


class TestMessage(TestCase):

    def __init__(self, *args, **kwargs):

        TestCase.__init__(self, *args, **kwargs)
        self.sample = os.path.join(
            settings.BASE_DIR,
            "documents",
            "tests",
            "samples",
            "mail.txt"
        )

    def test_init(self):

        with open(self.sample, "rb") as f:

            with mock.patch("logging.StreamHandler.emit") as __:
                message = Message(f.read())

            self.assertTrue(message)
            self.assertEqual(message.subject, "Test 0")

            data = message.attachment.read()

            self.assertEqual(
                md5(data).hexdigest(), "7c89655f9e9eb7dd8cde8568e8115d59")

            self.assertEqual(
                message.attachment.content_type, "application/pdf")
            with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
                self.assertEqual(m.id_buffer(data), "application/pdf")


class TestInlineMessage(TestCase):

    def __init__(self, *args, **kwargs):

        TestCase.__init__(self, *args, **kwargs)
        self.sample = os.path.join(
            settings.BASE_DIR,
            "documents",
            "tests",
            "samples",
            "inline_mail.txt"
        )

    def test_init(self):

        with open(self.sample, "rb") as f:

            with mock.patch("logging.StreamHandler.emit") as __:
                message = Message(f.read())

            self.assertTrue(message)
            self.assertEqual(message.subject, "Paperless Inline Image")

            data = message.attachment.read()

            self.assertEqual(
                md5(data).hexdigest(), "30c00a7b42913e65f7fdb0be40b9eef3")

            self.assertEqual(
                message.attachment.content_type, "image/png")
            with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
                self.assertEqual(m.id_buffer(data), "image/png")


class TestAttachment(TestCase):

    def test_init(self):
        data = base64.encodebytes(b"0")
        self.assertEqual(Attachment(data, "application/pdf").suffix, "pdf")
        self.assertEqual(Attachment(data, "image/png").suffix, "png")
        self.assertEqual(Attachment(data, "image/jpeg").suffix, "jpeg")
        self.assertEqual(Attachment(data, "image/gif").suffix, "gif")
        self.assertEqual(Attachment(data, "image/tiff").suffix, "tiff")
        self.assertEqual(Attachment(data, "image/png").read(), data)
@@ -1,7 +1,7 @@
from django.test import TestCase

from ..models import Document, Correspondent
from .factories import DocumentFactory, CorrespondentFactory
from ..models import Document, Correspondent


class CorrespondentTestCase(TestCase):
@@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase):
            pass

        m.return_value = (
            (None, lambda _: {"weight": 0, "parser": DummyParser}),
            (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
        )

        self.assertEqual(
@@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase):
            pass

        m.return_value = (
            (None, lambda _: {"weight": 0, "parser": DummyParser1}),
            (None, lambda _: {"weight": 1, "parser": DummyParser2}),
            (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
            (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
        )

        self.assertEqual(
@@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase):

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test__get_parser_class_0_parsers(self, m, *args):
        m.return_value = ((None, lambda _: None),)
        m.return_value = []
        with TemporaryDirectory() as tmpdir:
            self.assertIsNone(
                get_parser_class("doc.pdf")
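The test changes above track a new shape for consumer declaration responses: each signal handler now returns a dict carrying the parser class, a priority weight, and a one-argument test predicate, instead of a bare callable. A minimal sketch of a conforming handler follows; the names are hypothetical, and the assumption that the predicate receives the document path is mine, not stated in the diff.

class DummyPdfParser:
    pass


def my_consumer_declaration(sender, **kwargs):
    # Hypothetical handler matching the dict shape the tests above expect.
    return {
        "parser": DummyPdfParser,
        "weight": 0,
        "test": lambda path: path.lower().endswith(".pdf"),
    }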
@@ -1,14 +1,9 @@
from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest
from django.http import HttpResponse, HttpResponseBadRequest, Http404
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.views import APIView

from paperless.db import GnuPG
from paperless.views import StandardPagination
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.mixins import (
    DestroyModelMixin,
@@ -17,12 +12,17 @@ from rest_framework.mixins import (
    UpdateModelMixin
)
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.viewsets import (
    GenericViewSet,
    ModelViewSet,
    ReadOnlyModelViewSet
)

import documents.index as index
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .filters import (
    CorrespondentFilterSet,
    DocumentFilterSet,
@@ -30,8 +30,6 @@ from .filters import (
    DocumentTypeFilterSet,
    LogFilterSet
)

import documents.index as index
from .forms import UploadForm
from .models import Correspondent, Document, Log, Tag, DocumentType
from .serialisers import (
@@ -54,7 +52,7 @@ class CorrespondentViewSet(ModelViewSet):
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filter_class = CorrespondentFilterSet
    filterset_class = CorrespondentFilterSet
    ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")


@@ -65,7 +63,7 @@ class TagViewSet(ModelViewSet):
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filter_class = TagFilterSet
    filterset_class = TagFilterSet
    ordering_fields = ("name", "matching_algorithm", "match", "document_count")


@@ -76,7 +74,7 @@ class DocumentTypeViewSet(ModelViewSet):
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filter_class = DocumentTypeFilterSet
    filterset_class = DocumentTypeFilterSet
    ordering_fields = ("name", "matching_algorithm", "match", "document_count")


@@ -91,7 +89,7 @@ class DocumentViewSet(RetrieveModelMixin,
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
    filter_class = DocumentFilterSet
    filterset_class = DocumentFilterSet
    search_fields = ("title", "correspondent__name", "content")
    ordering_fields = (
        "id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
@@ -106,7 +104,7 @@ class DocumentViewSet(RetrieveModelMixin,
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

    def file_response(self, pk, disposition):
        #TODO: this should not be necessary here.
        # TODO: this should not be necessary here.
        content_types = {
            Document.TYPE_PDF: "application/pdf",
            Document.TYPE_PNG: "image/png",
@@ -114,7 +112,7 @@ class DocumentViewSet(RetrieveModelMixin,
            Document.TYPE_GIF: "image/gif",
            Document.TYPE_TIF: "image/tiff",
            Document.TYPE_CSV: "text/csv",
            Document.TYPE_MD: "text/markdown",
            Document.TYPE_TXT: "text/plain"
        }

@@ -132,7 +130,7 @@ class DocumentViewSet(RetrieveModelMixin,

    @action(methods=['post'], detail=False)
    def post_document(self, request, pk=None):
        #TODO: is this a good implementation?
        # TODO: is this a good implementation?
        form = UploadForm(data=request.POST, files=request.FILES)
        if form.is_valid():
            form.save()
@@ -142,17 +140,26 @@ class DocumentViewSet(RetrieveModelMixin,

    @action(methods=['get'], detail=True)
    def preview(self, request, pk=None):
        response = self.file_response(pk, "inline")
        return response
        try:
            response = self.file_response(pk, "inline")
            return response
        except FileNotFoundError:
            raise Http404("Document source file does not exist")

    @action(methods=['get'], detail=True)
    @cache_control(public=False, max_age=315360000)
    def thumb(self, request, pk=None):
        return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
        try:
            return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
        except FileNotFoundError:
            raise Http404("Document thumbnail does not exist")

    @action(methods=['get'], detail=True)
    def download(self, request, pk=None):
        return self.file_response(pk, "attachment")
        try:
            return self.file_response(pk, "attachment")
        except FileNotFoundError:
            raise Http404("Document source file does not exist")


class LogViewSet(ReadOnlyModelViewSet):
@@ -163,7 +170,7 @@ class LogViewSet(ReadOnlyModelViewSet):
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filter_class = LogFilterSet
    filterset_class = LogFilterSet
    ordering_fields = ("created",)


@@ -191,13 +198,12 @@ class SearchView(APIView):
            except (ValueError, TypeError):
                page = 1

            result_page = index.query_page(self.ix, query, page)

            return Response(
                {'count': len(result_page),
                 'page': result_page.pagenum,
                 'page_count': result_page.pagecount,
                 'results': list(map(self.add_infos_to_hit, result_page))})
            with index.query_page(self.ix, query, page) as result_page:
                return Response(
                    {'count': len(result_page),
                     'page': result_page.pagenum,
                     'page_count': result_page.pagecount,
                     'results': list(map(self.add_infos_to_hit, result_page))})

        else:
            return Response({
@@ -217,17 +223,16 @@ class SearchAutoCompleteView(APIView):
        if 'term' in request.query_params:
            term = request.query_params['term']
        else:
            term = None
            return HttpResponseBadRequest("Term required")

        if 'limit' in request.query_params:
            limit = int(request.query_params['limit'])
            if limit <= 0:
                return HttpResponseBadRequest("Invalid limit")
        else:
            limit = 10

        if term is not None:
            return Response(index.autocomplete(self.ix, term, limit))
        else:
            return Response([])
        return Response(index.autocomplete(self.ix, term, limit))


class StatisticsView(APIView):
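The preview, thumb and download actions above share one recovery pattern: a missing file on disk becomes an HTTP 404 instead of an unhandled 500. A standalone sketch of the same pattern outside the viewset; the view name and the direct open() call are illustrative, not from this commit.

from django.http import FileResponse, Http404


def serve_original(request, path):
    # Opening the file raises FileNotFoundError if the document was moved
    # or deleted behind Django's back; map that to a 404 so the client gets
    # a meaningful status instead of a server error.
    try:
        return FileResponse(open(path, "rb"), content_type="application/pdf")
    except FileNotFoundError:
        raise Http404("Document source file does not exist")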
@@ -11,6 +11,8 @@ writeable_hint = (
    "Set the permissions of {} to be writeable by the user running the "
    "Paperless services"
)


def path_check(env_var):
    messages = []
    directory = os.getenv(env_var)
@@ -27,6 +29,7 @@ def path_check(env_var):
        ))
    return messages


@register()
def paths_check(app_configs, **kwargs):
    """
@@ -34,9 +37,9 @@ def paths_check(app_configs, **kwargs):
    """

    check_messages = path_check("PAPERLESS_DATA_DIR") + \
        path_check("PAPERLESS_MEDIA_ROOT") + \
        path_check("PAPERLESS_CONSUMPTION_DIR") + \
        path_check("PAPERLESS_STATICDIR")

    return check_messages

@@ -64,3 +67,16 @@ def binaries_check(app_configs, **kwargs):
        check_messages.append(Warning(error.format(binary), hint))

    return check_messages


@register()
def debug_mode_check(app_configs, **kwargs):
    if settings.DEBUG:
        return [Warning(
            "DEBUG mode is enabled. Disable Debug mode. This is a serious "
            "security issue, since it puts security overrides in place which "
            "are meant to be used only during development. This "
            "also means that paperless will reveal various "
            "debugging information to anyone when something goes wrong.")]
    else:
        return []
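Checks registered this way run with "python manage.py check" and at server startup. A minimal sketch of adding another check in the same style, with a hypothetical name:

from django.core.checks import register


@register()
def example_check(app_configs, **kwargs):
    # Hypothetical check in the style of paths_check and debug_mode_check
    # above: return a list of Warning/Error messages, or an empty list
    # when everything is fine.
    return []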
@@ -1,4 +1,5 @@
import json
import math
import multiprocessing
import os
import re
@@ -13,6 +14,18 @@ elif os.path.exists("/etc/paperless.conf"):
elif os.path.exists("/usr/local/etc/paperless.conf"):
    load_dotenv("/usr/local/etc/paperless.conf")

# There are multiple levels of concurrency in paperless:
#  - Multiple consumers may be run in parallel.
#  - Each consumer may process multiple pages in parallel.
#  - Each Tesseract OCR run may spawn multiple threads to process a single page
#    slightly faster.
# The performance gains from having tesseract use multiple threads are minimal.
# However, when multiple pages are processed in parallel, the total number of
# OCR threads may exceed the number of available cpu cores, which will
# dramatically slow down the consumption process. This setting limits each
# Tesseract process to one thread.
os.environ['OMP_THREAD_LIMIT'] = "1"


def __get_boolean(key, default="NO"):
    """
@@ -21,9 +34,11 @@ def __get_boolean(key, default="NO"):
    """
    return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))


# NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")


###############################################################################
# Directories                                                                 #
###############################################################################
@@ -65,6 +80,7 @@ INSTALLED_APPS = [
    "documents.apps.DocumentsConfig",
    "paperless_tesseract.apps.PaperlessTesseractConfig",
    "paperless_text.apps.PaperlessTextConfig",
    "paperless_mail.apps.PaperlessMailConfig",

    "django.contrib.admin",

@@ -139,11 +155,11 @@ else:
X_FRAME_OPTIONS = 'SAMEORIGIN'

# We allow CORS from localhost:8080
CORS_ORIGIN_WHITELIST = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8080,https://localhost:8080").split(","))
CORS_ALLOWED_ORIGINS = tuple(os.getenv("PAPERLESS_CORS_ALLOWED_HOSTS", "http://localhost:8000").split(","))

if DEBUG:
    # Allow access from the angular development server during debugging
    CORS_ORIGIN_WHITELIST += ('http://localhost:4200',)
    CORS_ALLOWED_ORIGINS += ('http://localhost:4200',)

# The secret key has a default that should be fine so long as you're hosting
# Paperless on a closed network. However, if you're putting this anywhere
@@ -195,11 +211,11 @@ DATABASES = {
    }
}

# Always have sqlite available as a second option for management commands
# This is important when migrating to/from sqlite
DATABASES['sqlite'] = DATABASES['default'].copy()

if os.getenv("PAPERLESS_DBHOST"):
    # Have sqlite available as a second option for management commands
    # This is important when migrating to/from sqlite
    DATABASES['sqlite'] = DATABASES['default'].copy()

    DATABASES["default"] = {
        "ENGINE": "django.db.backends.postgresql_psycopg2",
        "HOST": os.getenv("PAPERLESS_DBHOST"),
@@ -244,6 +260,14 @@ LOGGING = {
        "handlers": ["dbhandler", "streamhandler"],
        "level": "DEBUG"
    },
    "paperless_mail": {
        "handlers": ["dbhandler", "streamhandler"],
        "level": "DEBUG"
    },
    "paperless_tesseract": {
        "handlers": ["dbhandler", "streamhandler"],
        "level": "DEBUG"
    },
    },
}

@@ -251,22 +275,60 @@ LOGGING = {
# Task queue                                                                  #
###############################################################################


# Sensible defaults for multitasking:
# use a fair balance between worker processes and threads per worker so that
# both consuming many documents in parallel and consuming large documents are
# reasonably fast.
# Favors threads per worker on smaller systems and never exceeds cpu_count()
# in total.

def default_task_workers():
    try:
        return max(
            math.floor(math.sqrt(multiprocessing.cpu_count())),
            1
        )
    except NotImplementedError:
        return 1


TASK_WORKERS = int(os.getenv("PAPERLESS_TASK_WORKERS", default_task_workers()))

Q_CLUSTER = {
    'name': 'paperless',
    'catch_up': False,
    'workers': TASK_WORKERS,
    'redis': os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
}


def default_threads_per_worker():
    try:
        return max(
            math.floor(multiprocessing.cpu_count() / TASK_WORKERS),
            1
        )
    except NotImplementedError:
        return 1


THREADS_PER_WORKER = os.getenv("PAPERLESS_THREADS_PER_WORKER", default_threads_per_worker())
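A standalone sanity check of the defaults above (not part of the commit): the worker count grows as the square root of the cpu count, the threads per worker take the remainder, and their product never exceeds cpu_count().

import math

for cpu_count in (1, 2, 4, 8, 16):
    workers = max(math.floor(math.sqrt(cpu_count)), 1)
    threads = max(math.floor(cpu_count / workers), 1)
    # e.g. 16 cpus -> 4 workers x 4 threads; 8 cpus -> 2 workers x 4 threads
    assert workers * threads <= cpu_count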

###############################################################################
# Paperless Specific Settings                                                 #
###############################################################################

CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))

CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")

OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")

# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

# The number of threads to use for OCR
OCR_THREADS = int(os.getenv("PAPERLESS_OCR_THREADS", multiprocessing.cpu_count()))

# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
@@ -311,6 +373,7 @@ FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))

# TODO: this should not have a prefix.
# Specify the filename format for output files
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
@@ -1,4 +1,4 @@
from django.conf.urls import include, url
from django.conf.urls import include
from django.contrib import admin
from django.contrib.auth.decorators import login_required
from django.urls import path, re_path
@@ -7,7 +7,6 @@ from django.views.generic import RedirectView
from rest_framework.routers import DefaultRouter

from paperless.consumers import StatusConsumer
from paperless.views import FaviconView
from documents.views import (
    CorrespondentViewSet,
    DocumentViewSet,
@@ -19,6 +18,7 @@ from documents.views import (
    SearchAutoCompleteView,
    StatisticsView
)
from paperless.views import FaviconView

api_router = DefaultRouter()
api_router.register(r"correspondents", CorrespondentViewSet)
@@ -31,32 +31,32 @@ api_router.register(r"tags", TagViewSet)
urlpatterns = [

    # API
    url(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
    url(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
    url(r"^api/search/", SearchView.as_view(), name="search"),
    url(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
    url(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),
    re_path(r"^api/auth/", include(('rest_framework.urls', 'rest_framework'), namespace="rest_framework")),
    re_path(r"^api/search/autocomplete/", SearchAutoCompleteView.as_view(), name="autocomplete"),
    re_path(r"^api/search/", SearchView.as_view(), name="search"),
    re_path(r"^api/statistics/", StatisticsView.as_view(), name="statistics"),
    re_path(r"^api/", include((api_router.urls, 'drf'), namespace="drf")),

    # Favicon
    url(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
    re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),

    # The Django admin
    url(r"admin/", admin.site.urls),
    re_path(r"admin/", admin.site.urls),

    # These redirects are here to support clients that use the old FetchView.
    url(
    re_path(
        r"^fetch/doc/(?P<pk>\d+)$",
        RedirectView.as_view(url='/api/documents/%(pk)s/download/'),
    ),
    url(
    re_path(
        r"^fetch/thumb/(?P<pk>\d+)$",
        RedirectView.as_view(url='/api/documents/%(pk)s/thumb/'),
    ),
    url(
    re_path(
        r"^fetch/preview/(?P<pk>\d+)$",
        RedirectView.as_view(url='/api/documents/%(pk)s/preview/'),
    ),
    url(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),
    re_path(r"^push$", csrf_exempt(RedirectView.as_view(url='/api/documents/post_document/'))),

    # Frontend assets TODO: this is pretty bad.
    path('assets/<path:path>', RedirectView.as_view(url='/static/frontend/assets/%(path)s')),
@@ -64,7 +64,7 @@ urlpatterns = [
    path('accounts/', include('django.contrib.auth.urls')),

    # Root of the Frontend
    url(r".*", login_required(IndexView.as_view())),
    re_path(r".*", login_required(IndexView.as_view())),

]
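Every url() call above is converted to re_path(). Since Django 2.0, django.conf.urls.url is an alias of django.urls.re_path (and is deprecated as of Django 3.1), so each conversion is a drop-in rename. A self-contained example mirroring one route from the list:

from django.urls import re_path
from django.views.generic import RedirectView

# Same regex and view as the deprecated url(...) form; only the import
# and the function name change.
urlpatterns = [
    re_path(r"^fetch/doc/(?P<pk>\d+)$",
            RedirectView.as_view(url='/api/documents/%(pk)s/download/')),
]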

@@ -74,8 +74,8 @@ websocket_urlpatterns = [
]

# Text in each page's <h1> (and above login form).
admin.site.site_header = 'Paperless'
admin.site.site_header = 'Paperless-ng'
# Text at the end of each page's <title>.
admin.site.site_title = 'Paperless'
admin.site.site_title = 'Paperless-ng'
# Text at the top of the admin index page.
admin.site.index_title = 'Paperless administration'
admin.site.index_title = 'Paperless-ng administration'
@@ -1 +1 @@
__version__ = (1, 0, 0)
__version__ = (0, 9, 1)
src/paperless_mail/__init__.py (new file, 0 lines)
src/paperless_mail/admin.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from django.contrib import admin
from paperless_mail.models import MailAccount, MailRule


class MailAccountAdmin(admin.ModelAdmin):

    list_display = ("name", "imap_server", "username")


class MailRuleAdmin(admin.ModelAdmin):

    list_filter = ("account",)

    list_display = ("name", "account", "folder", "action")


admin.site.register(MailAccount, MailAccountAdmin)
admin.site.register(MailRule, MailRuleAdmin)
src/paperless_mail/apps.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from django.apps import AppConfig


class PaperlessMailConfig(AppConfig):
    name = 'paperless_mail'

    verbose_name = 'Paperless Mail'
src/paperless_mail/mail.py (new file, 279 lines)
@@ -0,0 +1,279 @@
import os
import tempfile
from datetime import timedelta, date

from django.conf import settings
from django.utils.text import slugify
from django_q.tasks import async_task
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
    MailboxFolderSelectError

from documents.loggers import LoggingMixin
from documents.models import Correspondent
from paperless_mail.models import MailAccount, MailRule


class MailError(Exception):
    pass


class BaseMailAction:

    def get_criteria(self):
        return {}

    def post_consume(self, M, message_uids, parameter):
        pass


class DeleteMailAction(BaseMailAction):

    def post_consume(self, M, message_uids, parameter):
        M.delete(message_uids)


class MarkReadMailAction(BaseMailAction):

    def get_criteria(self):
        return {'seen': False}

    def post_consume(self, M, message_uids, parameter):
        M.seen(message_uids, True)


class MoveMailAction(BaseMailAction):

    def post_consume(self, M, message_uids, parameter):
        M.move(message_uids, parameter)


class FlagMailAction(BaseMailAction):

    def get_criteria(self):
        return {'flagged': False}

    def post_consume(self, M, message_uids, parameter):
        M.flag(message_uids, [MailMessageFlags.FLAGGED], True)


def get_rule_action(rule):
    if rule.action == MailRule.ACTION_FLAG:
        return FlagMailAction()
    elif rule.action == MailRule.ACTION_DELETE:
        return DeleteMailAction()
    elif rule.action == MailRule.ACTION_MOVE:
        return MoveMailAction()
    elif rule.action == MailRule.ACTION_MARK_READ:
        return MarkReadMailAction()
    else:
        raise ValueError("Unknown action.")


def make_criterias(rule):
    maximum_age = date.today() - timedelta(days=rule.maximum_age)
    criterias = {
        "date_gte": maximum_age
    }
    if rule.filter_from:
        criterias["from_"] = rule.filter_from
    if rule.filter_subject:
        criterias["subject"] = rule.filter_subject
    if rule.filter_body:
        criterias["body"] = rule.filter_body

    return {**criterias, **get_rule_action(rule).get_criteria()}
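For illustration only (not part of the commit): a rule with maximum_age=30, a hypothetical sender filter, and the mark-read action would yield criteria like the dict below, which imap_tools' AND(**criterias) folds into a single IMAP SEARCH expression.

from datetime import date, timedelta

criterias = {
    "date_gte": date.today() - timedelta(days=30),
    "from_": "billing@example.com",  # hypothetical filter_from value
    "seen": False,                   # contributed by MarkReadMailAction.get_criteria()
}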


def get_title(message, att, rule):
    if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
        title = message.subject
    elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
        title = os.path.splitext(os.path.basename(att.filename))[0]
    else:
        raise ValueError("Unknown title selector.")

    return title


def get_correspondent(message, rule):
    if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING:
        correspondent = None
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL:
        correspondent_name = message.from_
        correspondent = Correspondent.objects.get_or_create(
            name=correspondent_name, defaults={
                "slug": slugify(correspondent_name)
            })[0]
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME:
        if message.from_values and \
                'name' in message.from_values \
                and message.from_values['name']:
            correspondent_name = message.from_values['name']
        else:
            correspondent_name = message.from_

        correspondent = Correspondent.objects.get_or_create(
            name=correspondent_name, defaults={
                "slug": slugify(correspondent_name)
            })[0]
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
        correspondent = rule.assign_correspondent
    else:
        raise ValueError("Unknown correspondent selector")

    return correspondent


def get_mailbox(server, port, security):
    if security == MailAccount.IMAP_SECURITY_NONE:
        mailbox = MailBoxUnencrypted(server, port)
    elif security == MailAccount.IMAP_SECURITY_STARTTLS:
        mailbox = MailBox(server, port, starttls=True)
    elif security == MailAccount.IMAP_SECURITY_SSL:
        mailbox = MailBox(server, port)
    else:
        raise ValueError("Unknown IMAP security")
    return mailbox


class MailAccountHandler(LoggingMixin):

    def handle_mail_account(self, account):

        self.renew_logging_group()

        self.log('debug', f"Processing mail account {account}")

        total_processed_files = 0

        with get_mailbox(account.imap_server,
                         account.imap_port,
                         account.imap_security) as M:

            try:
                M.login(account.username, account.password)
            except Exception:
                raise MailError(
                    f"Error while authenticating account {account.name}")

            self.log('debug', f"Account {account}: Processing "
                              f"{account.rules.count()} rule(s)")

            for rule in account.rules.all():
                self.log(
                    'debug',
                    f"Account {account}: Processing rule {rule.name}")

                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Selecting folder {rule.folder}")

                try:
                    M.folder.set(rule.folder)
                except MailboxFolderSelectError:
                    raise MailError(
                        f"Rule {rule.name}: Folder {rule.folder} does not exist "
                        f"in account {account.name}")

                criterias = make_criterias(rule)

                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Searching folder with criteria "
                    f"{str(AND(**criterias))}")

                try:
                    messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
                except Exception:
                    raise MailError(
                        f"Rule {rule.name}: Error while fetching folder "
                        f"{rule.folder} of account {account.name}")

                post_consume_messages = []

                mails_processed = 0

                for message in messages:
                    try:
                        processed_files = self.handle_message(message, rule)
                    except Exception:
                        raise MailError(
                            f"Rule {rule.name}: Error while processing mail "
                            f"{message.uid} of account {account.name}")
                    if processed_files > 0:
                        post_consume_messages.append(message.uid)

                    total_processed_files += processed_files
                    mails_processed += 1

                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Processed {mails_processed} "
                    f"matching mail(s)")

                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Running mail actions on "
                    f"{len(post_consume_messages)} mails")

                try:
                    get_rule_action(rule).post_consume(
                        M,
                        post_consume_messages,
                        rule.action_parameter)

                except Exception:
                    raise MailError(
                        f"Rule {rule.name}: Error while processing post-consume "
                        f"actions for account {account.name}")

        return total_processed_files

    def handle_message(self, message, rule):
        if not message.attachments:
            return 0

        self.log(
            'debug',
            f"Rule {rule.account}.{rule}: "
            f"Processing mail {message.subject} from {message.from_} with "
            f"{len(message.attachments)} attachment(s)")

        correspondent = get_correspondent(message, rule)
        tag = rule.assign_tag
        doc_type = rule.assign_document_type

        processed_attachments = 0

        for att in message.attachments:

            title = get_title(message, att, rule)

            # TODO: check with parsers what file types are supported
            if att.content_type == 'application/pdf':

                os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
                _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
                with open(temp_filename, 'wb') as f:
                    f.write(att.payload)

                self.log(
                    'info',
                    f"Rule {rule.account}.{rule}: "
                    f"Consuming attachment {att.filename} from mail "
                    f"{message.subject} from {message.from_}")

                async_task(
                    "documents.tasks.consume_file",
                    path=temp_filename,
                    override_filename=att.filename,
                    override_title=title,
                    override_correspondent_id=correspondent.id if correspondent else None,
                    override_document_type_id=doc_type.id if doc_type else None,
                    override_tag_ids=[tag.id] if tag else None,
                    task_name=f"Mail: {att.filename}"
                )

                processed_attachments += 1

        return processed_attachments
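The task entry point paperless_mail.tasks.process_mail_accounts, referenced by the mail_fetcher command below and by the schedule migration, is not shown in this diff. A plausible minimal sketch, assuming it simply sums the handler's results over all configured accounts:

from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount


def process_mail_accounts():
    # Hypothetical sketch; the real task body is not part of this commit.
    total_files = 0
    for account in MailAccount.objects.all():
        total_files += MailAccountHandler().handle_mail_account(account)
    return total_files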
src/paperless_mail/management/__init__.py (new file, 0 lines)
src/paperless_mail/management/commands/__init__.py (new file, 0 lines)
src/paperless_mail/management/commands/mail_fetcher.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from django.core.management.base import BaseCommand

from paperless_mail import tasks


class Command(BaseCommand):

    help = """
    """.replace(" ", "")

    def handle(self, *args, **options):

        tasks.process_mail_accounts()
src/paperless_mail/migrations/0001_initial.py (new file, 48 lines)
@@ -0,0 +1,48 @@
# Generated by Django 3.1.3 on 2020-11-15 22:54

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

    initial = True

    dependencies = [
        ('documents', '1002_auto_20201111_1105'),
    ]

    operations = [
        migrations.CreateModel(
            name='MailAccount',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256, unique=True)),
                ('imap_server', models.CharField(max_length=256)),
                ('imap_port', models.IntegerField(blank=True, null=True)),
                ('imap_security', models.PositiveIntegerField(choices=[(1, 'No encryption'), (2, 'Use SSL'), (3, 'Use STARTTLS')], default=2)),
                ('username', models.CharField(max_length=256)),
                ('password', models.CharField(max_length=256)),
            ],
        ),
        migrations.CreateModel(
            name='MailRule',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=256)),
                ('folder', models.CharField(default='INBOX', max_length=256)),
                ('filter_from', models.CharField(blank=True, max_length=256, null=True)),
                ('filter_subject', models.CharField(blank=True, max_length=256, null=True)),
                ('filter_body', models.CharField(blank=True, max_length=256, null=True)),
                ('maximum_age', models.PositiveIntegerField(default=30)),
                ('action', models.PositiveIntegerField(choices=[(1, 'Delete'), (2, 'Move to specified folder'), (3, "Mark as read, don't process read mails"), (4, "Flag the mail, don't process flagged mails")], default=3, help_text='The action applied to the mail. This action is only performed when documents were consumed from the mail. Mails without attachments will remain entirely untouched.')),
                ('action_parameter', models.CharField(blank=True, help_text='Additional parameter for the action selected above, i.e., the target folder of the move to folder action.', max_length=256, null=True)),
                ('assign_title_from', models.PositiveIntegerField(choices=[(1, 'Use subject as title'), (2, 'Use attachment filename as title')], default=1)),
                ('assign_correspondent_from', models.PositiveIntegerField(choices=[(1, 'Do not assign a correspondent'), (2, 'Use mail address'), (3, 'Use name (or mail address if not available)'), (4, 'Use correspondent selected below')], default=1)),
                ('account', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='rules', to='paperless_mail.mailaccount')),
                ('assign_correspondent', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.correspondent')),
                ('assign_document_type', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.documenttype')),
                ('assign_tag', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='documents.tag')),
            ],
        ),
    ]
32   src/paperless_mail/migrations/0002_auto_20201117_1334.py    Normal file
@@ -0,0 +1,32 @@
# Generated by Django 3.1.3 on 2020-11-17 13:34

from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule


def add_schedules(apps, schema_editor):
    schedule('paperless_mail.tasks.process_mail_accounts',
             name="Check all e-mail accounts",
             schedule_type=Schedule.MINUTES,
             minutes=10)


def remove_schedules(apps, schema_editor):
    Schedule.objects.filter(
        func='paperless_mail.tasks.process_mail_accounts').delete()


class Migration(migrations.Migration):

    dependencies = [
        ('paperless_mail', '0001_initial'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(add_schedules, remove_schedules)
    ]
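A sketch of this migration's effect: once applied, django-q holds a Schedule row that fires the mail check every ten minutes. It can be inspected like this (names taken from the migration above; assumes a migrated database):

    from django_q.models import Schedule

    s = Schedule.objects.get(func='paperless_mail.tasks.process_mail_accounts')
    print(s.name, s.minutes)  # "Check all e-mail accounts", 10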
23   src/paperless_mail/migrations/0003_auto_20201118_1940.py    Normal file
@@ -0,0 +1,23 @@
# Generated by Django 3.1.3 on 2020-11-18 19:40

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('paperless_mail', '0002_auto_20201117_1334'),
    ]

    operations = [
        migrations.AlterField(
            model_name='mailaccount',
            name='imap_port',
            field=models.IntegerField(blank=True, help_text='This is usually 143 for unencrypted and STARTTLS connections, and 993 for SSL connections.', null=True),
        ),
        migrations.AlterField(
            model_name='mailrule',
            name='name',
            field=models.CharField(max_length=256, unique=True),
        ),
    ]
0    src/paperless_mail/migrations/__init__.py    Normal file
138  src/paperless_mail/models.py    Normal file
@@ -0,0 +1,138 @@
from django.db import models

import documents.models as document_models


class MailAccount(models.Model):

    IMAP_SECURITY_NONE = 1
    IMAP_SECURITY_SSL = 2
    IMAP_SECURITY_STARTTLS = 3

    IMAP_SECURITY_OPTIONS = (
        (IMAP_SECURITY_NONE, "No encryption"),
        (IMAP_SECURITY_SSL, "Use SSL"),
        (IMAP_SECURITY_STARTTLS, "Use STARTTLS"),
    )

    name = models.CharField(max_length=256, unique=True)

    imap_server = models.CharField(max_length=256)

    imap_port = models.IntegerField(
        blank=True,
        null=True,
        help_text="This is usually 143 for unencrypted and STARTTLS "
                  "connections, and 993 for SSL connections.")

    imap_security = models.PositiveIntegerField(
        choices=IMAP_SECURITY_OPTIONS,
        default=IMAP_SECURITY_SSL
    )

    username = models.CharField(max_length=256)

    password = models.CharField(max_length=256)

    def __str__(self):
        return self.name


class MailRule(models.Model):

    ACTION_DELETE = 1
    ACTION_MOVE = 2
    ACTION_MARK_READ = 3
    ACTION_FLAG = 4

    ACTIONS = (
        (ACTION_DELETE, "Delete"),
        (ACTION_MOVE, "Move to specified folder"),
        (ACTION_MARK_READ, "Mark as read, don't process read mails"),
        (ACTION_FLAG, "Flag the mail, don't process flagged mails")
    )

    TITLE_FROM_SUBJECT = 1
    TITLE_FROM_FILENAME = 2

    TITLE_SELECTOR = (
        (TITLE_FROM_SUBJECT, "Use subject as title"),
        (TITLE_FROM_FILENAME, "Use attachment filename as title")
    )

    CORRESPONDENT_FROM_NOTHING = 1
    CORRESPONDENT_FROM_EMAIL = 2
    CORRESPONDENT_FROM_NAME = 3
    CORRESPONDENT_FROM_CUSTOM = 4

    CORRESPONDENT_SELECTOR = (
        (CORRESPONDENT_FROM_NOTHING, "Do not assign a correspondent"),
        (CORRESPONDENT_FROM_EMAIL, "Use mail address"),
        (CORRESPONDENT_FROM_NAME, "Use name (or mail address if not available)"),
        (CORRESPONDENT_FROM_CUSTOM, "Use correspondent selected below")
    )

    name = models.CharField(max_length=256, unique=True)

    account = models.ForeignKey(
        MailAccount,
        related_name="rules",
        on_delete=models.CASCADE
    )

    folder = models.CharField(default='INBOX', max_length=256)

    filter_from = models.CharField(max_length=256, null=True, blank=True)
    filter_subject = models.CharField(max_length=256, null=True, blank=True)
    filter_body = models.CharField(max_length=256, null=True, blank=True)

    maximum_age = models.PositiveIntegerField(default=30)

    action = models.PositiveIntegerField(
        choices=ACTIONS,
        default=ACTION_MARK_READ,
        help_text="The action applied to the mail. This action is only "
                  "performed when documents were consumed from the mail. "
                  "Mails without attachments will remain entirely "
                  "untouched."
    )

    action_parameter = models.CharField(
        max_length=256, blank=True, null=True,
        help_text="Additional parameter for the action selected above, i.e., "
                  "the target folder of the move to folder action."
    )

    assign_title_from = models.PositiveIntegerField(
        choices=TITLE_SELECTOR,
        default=TITLE_FROM_SUBJECT
    )

    assign_tag = models.ForeignKey(
        document_models.Tag,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    assign_document_type = models.ForeignKey(
        document_models.DocumentType,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    assign_correspondent_from = models.PositiveIntegerField(
        choices=CORRESPONDENT_SELECTOR,
        default=CORRESPONDENT_FROM_NOTHING
    )

    assign_correspondent = models.ForeignKey(
        document_models.Correspondent,
        null=True,
        blank=True,
        on_delete=models.SET_NULL
    )

    def __str__(self):
        return self.name
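A minimal sketch of how the two models above fit together; every value here is made up, only the field and constant names come from the model definitions:

    from paperless_mail.models import MailAccount, MailRule

    account = MailAccount.objects.create(
        name="Example account",          # hypothetical
        imap_server="imap.example.com",  # hypothetical
        imap_port=993,
        imap_security=MailAccount.IMAP_SECURITY_SSL,
        username="user",
        password="pass",
    )

    # A rule hangs off its account via the FK and narrows which mails get consumed.
    MailRule.objects.create(
        name="Invoices",
        account=account,
        filter_subject="Invoice",
        action=MailRule.ACTION_MARK_READ,
        assign_title_from=MailRule.TITLE_FROM_SUBJECT,
    )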
23   src/paperless_mail/tasks.py    Normal file
@@ -0,0 +1,23 @@
import logging

from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount


def process_mail_accounts():
    total_new_documents = 0
    for account in MailAccount.objects.all():
        total_new_documents += MailAccountHandler().handle_mail_account(account)

    if total_new_documents > 0:
        return f"Added {total_new_documents} document(s)."
    else:
        return "No new documents were added."


def process_mail_account(name):
    # QuerySets have no .find(); look the account up by name instead.
    account = MailAccount.objects.filter(name=name).first()
    if account:
        MailAccountHandler().handle_mail_account(account)
    else:
        logging.error("Unknown mail account: {}".format(name))
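Both entry points can also be called directly, e.g. from a Django shell; a sketch (the account name is hypothetical):

    from paperless_mail import tasks

    print(tasks.process_mail_accounts())           # e.g. "No new documents were added."
    tasks.process_mail_account("Example account")  # hypothetical account name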
0    src/paperless_mail/tests/__init__.py    Normal file
360  src/paperless_mail/tests/test_mail.py    Normal file
@@ -0,0 +1,360 @@
import uuid
from collections import namedtuple
from typing import ContextManager
from unittest import mock

from django.test import TestCase
from imap_tools import MailMessageFlags, MailboxFolderSelectError

from documents.models import Correspondent
from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title
from paperless_mail.models import MailRule, MailAccount


class BogusFolderManager:

    current_folder = "INBOX"

    def set(self, new_folder):
        if new_folder not in ["INBOX", "spam"]:
            raise MailboxFolderSelectError(None, "uhm")
        self.current_folder = new_folder


class BogusMailBox(ContextManager):
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __init__(self):
        self.messages = []
        self.messages_spam = []

    def login(self, username, password):
        if not (username == 'admin' and password == 'secret'):
            raise Exception()

    folder = BogusFolderManager()

    def fetch(self, criteria, mark_seen):
        msg = self.messages

        criteria = str(criteria).strip('()').split(" ")

        if 'UNSEEN' in criteria:
            msg = filter(lambda m: not m.seen, msg)

        if 'SUBJECT' in criteria:
            subject = criteria[criteria.index('SUBJECT') + 1].strip('"')
            msg = filter(lambda m: subject in m.subject, msg)

        if 'BODY' in criteria:
            body = criteria[criteria.index('BODY') + 1].strip('"')
            msg = filter(lambda m: body in m.body, msg)

        if 'FROM' in criteria:
            from_ = criteria[criteria.index('FROM') + 1].strip('"')
            msg = filter(lambda m: from_ in m.from_, msg)

        if 'UNFLAGGED' in criteria:
            msg = filter(lambda m: not m.flagged, msg)

        return list(msg)

    def seen(self, uid_list, seen_val):
        for message in self.messages:
            if message.uid in uid_list:
                message.seen = seen_val

    def delete(self, uid_list):
        self.messages = list(filter(lambda m: m.uid not in uid_list, self.messages))

    def flag(self, uid_list, flag_set, value):
        for message in self.messages:
            if message.uid in uid_list:
                for flag in flag_set:
                    if flag == MailMessageFlags.FLAGGED:
                        message.flagged = value

    def move(self, uid_list, folder):
        if folder == "spam":
            # extend with the matching messages (appending the filter object
            # itself would store an iterator, not the messages)
            self.messages_spam += list(
                filter(lambda m: m.uid in uid_list, self.messages)
            )
            self.messages = list(
                filter(lambda m: m.uid not in uid_list, self.messages)
            )
        else:
            raise Exception()


def create_message(num_attachments=1, body="", subject="the subject", from_="noone@mail.com", seen=False, flagged=False):
    message = namedtuple('MailMessage', [])

    message.uid = uuid.uuid4()
    message.subject = subject
    message.attachments = []
    message.from_ = from_
    message.body = body
    for i in range(num_attachments):
        attachment = namedtuple('Attachment', [])
        attachment.filename = 'some_file.pdf'
        attachment.content_type = 'application/pdf'
        attachment.payload = b'content of the attachment'
        message.attachments.append(attachment)

    message.seen = seen
    message.flagged = flagged

    return message


class TestMail(TestCase):

    def setUp(self):
        patcher = mock.patch('paperless_mail.mail.MailBox')
        m = patcher.start()
        self.bogus_mailbox = BogusMailBox()
        m.return_value = self.bogus_mailbox
        self.addCleanup(patcher.stop)

        patcher = mock.patch('paperless_mail.mail.async_task')
        self.async_task = patcher.start()
        self.addCleanup(patcher.stop)

        self.reset_bogus_mailbox()

        self.mail_account_handler = MailAccountHandler()

    def reset_bogus_mailbox(self):
        self.bogus_mailbox.messages = []
        self.bogus_mailbox.messages_spam = []
        self.bogus_mailbox.messages.append(create_message(subject="Invoice 1", from_="amazon@amazon.de", body="cables", seen=True, flagged=False))
        self.bogus_mailbox.messages.append(create_message(subject="Invoice 2", body="from my favorite electronic store", seen=False, flagged=True))
        self.bogus_mailbox.messages.append(create_message(subject="Claim your $10M prize now!", from_="amazon@amazon-some-indian-site.org", seen=False))

    def test_get_correspondent(self):
        message = namedtuple('MailMessage', [])
        message.from_ = "someone@somewhere.com"
        message.from_values = {'name': "Someone!", 'email': "someone@somewhere.com"}

        message2 = namedtuple('MailMessage', [])
        message2.from_ = "me@localhost.com"
        message2.from_values = {'name': "", 'email': "fake@localhost.com"}

        me_localhost = Correspondent.objects.create(name=message2.from_)
        someone_else = Correspondent.objects.create(name="someone else")

        rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
        self.assertIsNone(get_correspondent(message, rule))

        rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
        c = get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "someone@somewhere.com")
        c = get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "me@localhost.com")
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(name="c", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
        c = get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "Someone!")
        c = get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
        c = get_correspondent(message, rule)
        self.assertEqual(c, someone_else)

    def test_get_title(self):
        message = namedtuple('MailMessage', [])
        message.subject = "the message title"
        att = namedtuple('Attachment', [])
        att.filename = "this_is_the_file.pdf"
        rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME)
        self.assertEqual(get_title(message, att, rule), "this_is_the_file")
        rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT)
        self.assertEqual(get_title(message, att, rule), "the message title")

    def test_handle_message(self):
        message = namedtuple('MailMessage', [])
        message.subject = "the message title"
        message.from_ = "Myself"

        att = namedtuple('Attachment', [])
        att.filename = "test1.pdf"
        att.content_type = 'application/pdf'
        att.payload = b"attachment contents"

        att2 = namedtuple('Attachment', [])
        att2.filename = "test2.pdf"
        att2.content_type = 'application/pdf'
        att2.payload = b"attachment contents"

        att3 = namedtuple('Attachment', [])
        att3.filename = "test3.pdf"
        att3.content_type = 'application/invalid'
        att3.payload = b"attachment contents"

        message.attachments = [att, att2, att3]

        account = MailAccount()
        rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)

        result = self.mail_account_handler.handle_message(message, rule)

        self.assertEqual(result, 2)

        self.assertEqual(len(self.async_task.call_args_list), 2)

        args1, kwargs1 = self.async_task.call_args_list[0]
        args2, kwargs2 = self.async_task.call_args_list[1]

        self.assertEqual(kwargs1['override_title'], "test1")
        self.assertEqual(kwargs1['override_filename'], "test1.pdf")

        self.assertEqual(kwargs2['override_title'], "test2")
        self.assertEqual(kwargs2['override_filename'], "test2.pdf")

    @mock.patch("paperless_mail.mail.async_task")
    def test_handle_empty_message(self, m):
        message = namedtuple('MailMessage', [])

        message.attachments = []
        rule = MailRule()

        result = self.mail_account_handler.handle_message(message, rule)

        self.assertFalse(m.called)
        self.assertEqual(result, 0)

    def test_handle_mail_account_mark_read(self):

        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)

        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 2)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)

    def test_handle_mail_account_delete(self):

        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_DELETE, filter_subject="Invoice")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 2)
        self.assertEqual(len(self.bogus_mailbox.messages), 1)

    def test_handle_mail_account_flag(self):
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")

        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)

    def test_handle_mail_account_move(self):
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam", filter_subject="Claim")

        self.assertEqual(self.async_task.call_count, 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)

    def test_errors(self):
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")

        try:
            self.mail_account_handler.handle_mail_account(account)
        except MailError as e:
            self.assertTrue(str(e).startswith("Error while authenticating account"))
        else:
            self.fail("Should raise exception")

        account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")

        try:
            self.mail_account_handler.handle_mail_account(account)
        except MailError as e:
            self.assertTrue("uuuh does not exist" in str(e))
        else:
            self.fail("Should raise exception")

        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")

        rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")

        try:
            self.mail_account_handler.handle_mail_account(account)
        except MailError as e:
            self.assertTrue("Error while processing post-consume actions" in str(e))
        else:
            self.fail("Should raise exception")

    def test_filters(self):

        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule3", account=account, action=MailRule.ACTION_DELETE, filter_subject="Claim")

        self.assertEqual(self.async_task.call_count, 0)

        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 1)

        self.reset_bogus_mailbox()

        rule.filter_subject = None
        rule.filter_body = "electronic"
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 2)

        self.reset_bogus_mailbox()

        rule.filter_from = "amazon"
        rule.filter_body = None
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 1)
        self.assertEqual(self.async_task.call_count, 4)

        self.reset_bogus_mailbox()

        rule.filter_from = "amazon"
        rule.filter_body = "cables"
        rule.filter_subject = "Invoice"
        rule.save()
        self.assertEqual(len(self.bogus_mailbox.messages), 3)
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 5)
@@ -1,5 +1,7 @@
from django.apps import AppConfig

from paperless_tesseract.signals import tesseract_consumer_declaration


class PaperlessTesseractConfig(AppConfig):

@@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig):

        from documents.signals import document_consumer_declaration

        from .signals import ConsumerDeclaration

        document_consumer_declaration.connect(ConsumerDeclaration.handle)
        document_consumer_declaration.connect(tesseract_consumer_declaration)

        AppConfig.ready(self)
@@ -2,18 +2,17 @@ import itertools
import os
import re
import subprocess
from multiprocessing.pool import Pool
from multiprocessing.pool import ThreadPool

import langdetect
import pdftotext
import pyocr
from django.conf import settings
from PIL import Image
from django.conf import settings
from pyocr import PyocrException

import pdftotext
from documents.parsers import DocumentParser, ParseError, run_unpaper, \
    run_convert

from .languages import ISO639


@@ -45,8 +44,8 @@ class RasterisedDocumentParser(DocumentParser):
                        alpha="remove",
                        strip=True,
                        trim=True,
                        input="{}[0]".format(self.document_path),
                        output=out_path,
                        input_file="{}[0]".format(self.document_path),
                        output_file=out_path,
                        logging_group=self.logging_group)
        except ParseError:
            # if convert fails, fall back to extracting
@@ -66,8 +65,8 @@ class RasterisedDocumentParser(DocumentParser):
                        alpha="remove",
                        strip=True,
                        trim=True,
                        input=gs_out_path,
                        output=out_path,
                        input_file=gs_out_path,
                        output_file=out_path,
                        logging_group=self.logging_group)

        return out_path
@@ -87,7 +86,7 @@ class RasterisedDocumentParser(DocumentParser):
            return self._text

        if not settings.OCR_ALWAYS and self._is_ocred():
            self.log("info", "Skipping OCR, using Text from PDF")
            self.log("debug", "Skipping OCR, using Text from PDF")
            self._text = get_text_from_pdf(self.document_path)
            return self._text

@@ -100,7 +99,7 @@ class RasterisedDocumentParser(DocumentParser):
        try:

            sample_page_index = int(len(images) / 2)
            self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images)))
            self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
            self.progress_callback(0.4, 1, "Language Detection.")
            sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
            guessed_language = self._guess_language(sample_page_text)
@@ -111,7 +110,7 @@ class RasterisedDocumentParser(DocumentParser):
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)

            elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
                self.log("info", "Detected language: {} (default language)".format(guessed_language))
                self.log("debug", "Detected language: {} (default language)".format(guessed_language))
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)

            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
@@ -119,10 +118,10 @@ class RasterisedDocumentParser(DocumentParser):
                ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)

            else:
                self.log("info", "Detected language: {}".format(guessed_language))
                self.log("debug", "Detected language: {}".format(guessed_language))
                ocr_pages = self._ocr(images, ISO639[guessed_language], report_progress=True)

            self.log("info", "OCR completed.")
            self.log("debug", "OCR completed.")
            self._text = strip_excess_whitespace(" ".join(ocr_pages))
            return self._text

@@ -134,7 +133,7 @@ class RasterisedDocumentParser(DocumentParser):
        Greyscale images are easier for Tesseract to OCR
        """

        self.log("info", "Converting document {} into greyscale images...".format(self.document_path))
        self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))

        # Convert PDF to multiple PNMs
        pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
@@ -142,8 +141,8 @@ class RasterisedDocumentParser(DocumentParser):
        run_convert(density=settings.CONVERT_DENSITY,
                    depth="8",
                    type="grayscale",
                    input=self.document_path,
                    output=pnm,
                    input_file=self.document_path,
                    output_file=pnm,
                    logging_group=self.logging_group)

        # Get a list of converted images
@@ -152,12 +151,12 @@ class RasterisedDocumentParser(DocumentParser):
            if f.endswith(".pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
        self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))

        self.progress_callback(0.2, 1, "Running unpaper on {} pages...".format(len(pnms)))

        # Run unpaper in parallel on converted images
        with Pool(processes=settings.OCR_THREADS) as pool:
        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
            pnms = pool.map(run_unpaper, pnms)

        return sorted(filter(lambda __: os.path.isfile(__), pnms))
@@ -167,13 +166,13 @@ class RasterisedDocumentParser(DocumentParser):
            guess = langdetect.detect(text)
            return guess
        except Exception as e:
            self.log('debug', "Language detection failed with: {}".format(e))
            self.log('warning', "Language detection failed with: {}".format(e))
            return None

    def _ocr(self, imgs, lang, report_progress=False):
        self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
        self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
        r = []
        with Pool(processes=settings.OCR_THREADS) as pool:
        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
            # r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            for i, page in enumerate(pool.imap(image_to_string, itertools.product(imgs, [lang]))):
                if report_progress:
@@ -191,7 +190,7 @@ class RasterisedDocumentParser(DocumentParser):
        images_copy = list(images)
        del images_copy[sample_page_index]
        if images_copy:
            self.log('info', 'Continuing ocr with default language.')
            self.log('debug', 'Continuing ocr with default language.')
            ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE, report_progress=True)
            ocr_pages.insert(sample_page_index, sample_page)
            return ocr_pages
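The Pool-to-ThreadPool change above swaps forked worker processes for threads inside the current process, sized by the new THREADS_PER_WORKER setting instead of OCR_THREADS. A minimal sketch of the pattern (the worker function is a stand-in):

    from multiprocessing.pool import ThreadPool

    def work(page):          # stand-in for run_unpaper / image_to_string
        return page * 2

    with ThreadPool(processes=4) as pool:   # settings.THREADS_PER_WORKER in the parser
        print(pool.map(work, [1, 2, 3]))    # [2, 4, 6]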
@@ -3,21 +3,16 @@ import re
from .parsers import RasterisedDocumentParser


class ConsumerDeclaration:
def tesseract_consumer_declaration(sender, **kwargs):
    return {
        "parser": RasterisedDocumentParser,
        "weight": 0,
        "test": tesseract_consumer_test
    }

    MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        return cls.test
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")

    @classmethod
    def test(cls, doc):

        if cls.MATCHING_FILES.match(doc.lower()):
            return {
                "parser": RasterisedDocumentParser,
                "weight": 0
            }

        return None
def tesseract_consumer_test(doc):
    return MATCHING_FILES.match(doc.lower())
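With the refactor above, the declaration is a plain module-level function rather than a class, and receivers hand back a dict. A sketch of probing it (the file names are arbitrary):

    from paperless_tesseract.signals import tesseract_consumer_declaration

    decl = tesseract_consumer_declaration(sender=None)
    print(decl["weight"])                    # 0
    print(bool(decl["test"]("scan.pdf")))    # True: matches the raster pattern
    print(bool(decl["test"]("notes.docx")))  # False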
BIN  src/paperless_tesseract/tests/samples/simple.pdf    Normal file (binary file not shown)
BIN  src/paperless_tesseract/tests/samples/simple.png    Normal file (binary file not shown; size: 7.7 KiB)
@@ -5,10 +5,10 @@ from unittest import mock
from uuid import uuid4

from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings

from ..parsers import RasterisedDocumentParser
from django.conf import settings


class TestDate(TestCase):

221  src/paperless_tesseract/tests/test_parser.py    Normal file
@@ -0,0 +1,221 @@
import os
import shutil
import tempfile
import uuid
from typing import ContextManager
from unittest import mock

from django.test import TestCase, override_settings
from pyocr.error import TesseractError

from documents.parsers import ParseError, run_convert
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError

image_to_string_calls = []


class FakeTesseract(object):

    @staticmethod
    def can_detect_orientation():
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        raise TesseractError("arbitrary status", "message")

    @staticmethod
    def get_available_languages():
        return ['eng', 'deu']

    @staticmethod
    def image_to_string(file_handle, lang):
        image_to_string_calls.append((file_handle.name, lang))
        return file_handle.read()


class FakePyOcr(object):

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]


def fake_convert(input_file, output_file, **kwargs):
    with open(input_file) as f:
        lines = f.readlines()

    for i, line in enumerate(lines):
        with open(output_file % i, "w") as f2:
            f2.write(line.strip())


def fake_unpaper(pnm):
    output = pnm + ".unpaper.pnm"
    shutil.copy(pnm, output)
    return output


class FakeImageFile(ContextManager):
    def __init__(self, fname):
        self.fname = fname

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __enter__(self):
        return os.path.basename(self.fname)


fake_image = FakeImageFile


@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert)
@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper)
@mock.patch("paperless_tesseract.parsers.Image.open", open)
class TestRasterisedDocumentParser(TestCase):

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        global image_to_string_calls

        image_to_string_calls = []

        override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable()

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def get_input_file(self, pages):
        _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch)
        with open(fname, "w") as f:
            f.writelines([f"line {p}\n" for p in range(pages)])
        return fname

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_simple_language_match(self):
        parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_2_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_3_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None)
    def test_parse_text_lang_detect_failed(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it")
    def test_parse_text_lang_not_installed(self):
        parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2 line 3")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_text_lang_mismatch(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_empty_doc(self):
        parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())
        try:
            parser.get_text()
        except ParseError as e:
            self.assertEqual("Empty document, nothing to do.", str(e))
        else:
            self.fail("Should raise exception")


class TestAuxilliaryFunctions(TestCase):

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        override_settings(SCRATCH_DIR=self.scratch).enable()

    def tearDown(self):
        shutil.rmtree(self.scratch)

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def test_get_text_from_pdf(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf'))

        self.assertEqual(text.strip(), "This is a test document.")

    def test_get_text_from_pdf_error(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png'))

        self.assertEqual(text.strip(), "")

    def test_image_to_string(self):
        text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng"))

        self.assertEqual(text, "This is a test document.")

    def test_image_to_string_language_unavailable(self):
        try:
            image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita"))
        except OCRError as e:
            self.assertTrue("Failed loading language" in str(e))
        else:
            self.fail("Should raise exception")

    @override_settings(OCR_ALWAYS=False)
    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf")
    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale")
    def test_is_ocred(self, m2, m):
        parser = RasterisedDocumentParser("", uuid.uuid4())
        m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \
                         "lots of text lots of text lots of text lots of text lots of text lots of text " \
                         "lots of text lots of text lots of text lots of text lots of text lots of text "
        parser.get_text()
        self.assertEqual(m.call_count, 2)
        self.assertEqual(m2.call_count, 0)

    def test_thumbnail(self):
        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        # don't really know how to test it, just call it and assert that it does not raise anything.

    @mock.patch("paperless_tesseract.parsers.run_convert")
    def test_thumbnail_fallback(self, m):

        def call_convert(input_file, output_file, **kwargs):
            if ".pdf" in input_file:
                raise ParseError("Does not compute.")
            else:
                run_convert(input_file=input_file, output_file=output_file, **kwargs)

        m.side_effect = call_convert

        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        # don't really know how to test it, just call it and assert that it does not raise anything.
@@ -1,6 +1,6 @@
from django.test import TestCase

from ..signals import ConsumerDeclaration
from paperless_tesseract.signals import tesseract_consumer_test


class SignalsTestCase(TestCase):

@@ -20,7 +20,7 @@ class SignalsTestCase(TestCase):
        for prefix in prefixes:
            for suffix in suffixes:
                name = "{}.{}".format(prefix, suffix)
                self.assertTrue(ConsumerDeclaration.test(name))
                self.assertTrue(tesseract_consumer_test(name))

    def test_test_handles_various_file_names_false(self):

@@ -30,7 +30,7 @@ class SignalsTestCase(TestCase):
        for prefix in prefixes:
            for suffix in suffixes:
                name = "{}.{}".format(prefix, suffix)
                self.assertFalse(ConsumerDeclaration.test(name))
                self.assertFalse(tesseract_consumer_test(name))

        self.assertFalse(ConsumerDeclaration.test(""))
        self.assertFalse(ConsumerDeclaration.test("doc"))
        self.assertFalse(tesseract_consumer_test(""))
        self.assertFalse(tesseract_consumer_test("doc"))
@@ -1,5 +1,7 @@
from django.apps import AppConfig

from paperless_text.signals import text_consumer_declaration


class PaperlessTextConfig(AppConfig):

@@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig):

        from documents.signals import document_consumer_declaration

        from .signals import ConsumerDeclaration

        document_consumer_declaration.connect(ConsumerDeclaration.handle)
        document_consumer_declaration.connect(text_consumer_declaration)

        AppConfig.ready(self)
@@ -47,8 +47,8 @@ class TextDocumentParser(DocumentParser):

        def read_text():
            with open(self.document_path, 'r') as src:
                lines = [l.strip() for l in src.readlines()]
                text = "\n".join([l for l in lines[:n_lines]])
                lines = [line.strip() for line in src.readlines()]
                text = "\n".join([line for line in lines[:n_lines]])
                return text.replace('"', "'")

        def create_txlayer():
@@ -3,21 +3,16 @@ import re
from .parsers import TextDocumentParser


class ConsumerDeclaration:
def text_consumer_declaration(sender, **kwargs):
    return {
        "parser": TextDocumentParser,
        "weight": 10,
        "test": text_consumer_test
    }

    MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")

    @classmethod
    def handle(cls, sender, **kwargs):
        return cls.test
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")

    @classmethod
    def test(cls, doc):

        if cls.MATCHING_FILES.match(doc.lower()):
            return {
                "parser": TextDocumentParser,
                "weight": 10
            }

        return None
def text_consumer_test(doc):
    return MATCHING_FILES.match(doc.lower())
@@ -1,12 +1,11 @@
[pycodestyle]
exclude = migrations, paperless/settings.py, .tox

ignore = E501

[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all -n auto
addopts = --pythonwarnings=all
env =
    PAPERLESS_PASSPHRASE=THISISNOTASECRET
    PAPERLESS_SECRET=paperless
    PAPERLESS_EMAIL_SECRET=paperless

@@ -15,4 +14,4 @@ env =
source =
    ./
omit =
    */tests
    */tests/*