Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-07-28 18:24:38 -05:00
Merge branch 'dev' into celery-tasks
@@ -50,17 +50,17 @@ class DocumentTypeAdmin(admin.ModelAdmin):
class DocumentAdmin(admin.ModelAdmin):

    search_fields = ("correspondent__name", "title", "content", "tags__name")
    readonly_fields = ("added", "file_type", "storage_type", "filename")
    readonly_fields = ("added", "mime_type", "storage_type", "filename")

    list_display_links = ("title",)

    list_display = (
        "title",
        "created",
        "added",
        "correspondent",
        "title",
        "tags_",
        "archive_serial_number",
        "document_type",
        "filename"
        "created",
    )

    list_filter = (
        "document_type",
        "tags",
@@ -118,9 +118,19 @@ class DocumentAdmin(admin.ModelAdmin):

class LogAdmin(admin.ModelAdmin):

    def has_add_permission(self, request):
        return False

    def has_change_permission(self, request, obj=None):
        return False

    list_display = ("created", "message", "level",)
    list_filter = ("level", "created",)

    ordering = ('-created',)

    list_display_links = ("created", "message")


admin.site.register(Correspondent, CorrespondentAdmin)
admin.site.register(Tag, TagAdmin)
@@ -30,10 +30,12 @@ class DocumentClassifier(object):
    FORMAT_VERSION = 5

    def __init__(self):
        # mtime of the model file on disk. used to prevent reloading when nothing has changed.
        # mtime of the model file on disk. used to prevent reloading when
        # nothing has changed.
        self.classifier_version = 0

        # hash of the training data. used to prevent re-training when the training data has not changed.
        # hash of the training data. used to prevent re-training when the
        # training data has not changed.
        self.data_hash = None

        self.data_vectorizer = None
@@ -48,10 +50,12 @@ class DocumentClassifier(object):
            schema_version = pickle.load(f)

            if schema_version != self.FORMAT_VERSION:
                raise IncompatibleClassifierVersionError("Cannot load classifier, incompatible versions.")
                raise IncompatibleClassifierVersionError(
                    "Cannot load classifier, incompatible versions.")
            else:
                if self.classifier_version > 0:
                    logger.info("Classifier updated on disk, reloading classifier models")
                    logger.info("Classifier updated on disk, "
                                "reloading classifier models")
                self.data_hash = pickle.load(f)
                self.data_vectorizer = pickle.load(f)
                self.tags_binarizer = pickle.load(f)
@@ -82,20 +86,22 @@ class DocumentClassifier(object):
        # Step 1: Extract and preprocess training data from the database.
        logging.getLogger(__name__).debug("Gathering data from database...")
        m = hashlib.sha1()
        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):
        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):  # NOQA: E501
            preprocessed_content = preprocess_content(doc.content)
            m.update(preprocessed_content.encode('utf-8'))
            data.append(preprocessed_content)

            y = -1
            if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = doc.document_type.pk
            dt = doc.document_type
            if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = dt.pk
            m.update(y.to_bytes(4, 'little', signed=True))
            labels_document_type.append(y)

            y = -1
            if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = doc.correspondent.pk
            cor = doc.correspondent
            if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = cor.pk
            m.update(y.to_bytes(4, 'little', signed=True))
            labels_correspondent.append(y)
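The loop above hashes every document's content together with its (possibly absent) label ids, so retraining can be skipped when the training data is unchanged. A minimal standalone sketch of that scheme, with illustrative names:

import hashlib

def training_data_hash(samples):
    # samples: iterable of (content, label_id) pairs; label_id is -1 for "no label"
    m = hashlib.sha1()
    for content, label in samples:
        m.update(content.encode('utf-8'))
        # signed little-endian encoding, matching y.to_bytes(4, 'little', signed=True)
        m.update(label.to_bytes(4, 'little', signed=True))
    return m.hexdigest()

# The digest only changes when content or labels change:
print(training_data_hash([("invoice text", 3), ("private letter", -1)]))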
@@ -145,7 +151,7 @@ class DocumentClassifier(object):
        # Step 3: train the classifiers
        if num_tags > 0:
            logging.getLogger(__name__).debug("Training tags classifier...")
            self.tags_classifier = MLPClassifier(verbose=True, tol=0.01)
            self.tags_classifier = MLPClassifier(tol=0.01)
            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            self.tags_classifier = None
@@ -157,7 +163,7 @@ class DocumentClassifier(object):
            logging.getLogger(__name__).debug(
                "Training correspondent classifier..."
            )
            self.correspondent_classifier = MLPClassifier(verbose=True, tol=0.01)
            self.correspondent_classifier = MLPClassifier(tol=0.01)
            self.correspondent_classifier.fit(
                data_vectorized,
                labels_correspondent
@@ -173,7 +179,7 @@ class DocumentClassifier(object):
            logging.getLogger(__name__).debug(
                "Training document type classifier..."
            )
            self.document_type_classifier = MLPClassifier(verbose=True, tol=0.01)
            self.document_type_classifier = MLPClassifier(tol=0.01)
            self.document_type_classifier.fit(
                data_vectorized,
                labels_document_type
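For context, the training these hunks touch boils down to fitting scikit-learn MLPClassifier instances on vectorized document text; the diff only drops verbose=True. A small sketch with stand-in data:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

texts = ["invoice for october", "meeting notes", "invoice for november"]
labels = [1, -1, 1]  # -1 encodes "no label", as in the training loop above

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(texts)

# verbose=True was removed in the diff, so training no longer spams the log
clf = MLPClassifier(tol=0.01)
clf.fit(data_vectorized, labels)
print(clf.predict(vectorizer.transform(["invoice for december"])))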
@@ -2,8 +2,8 @@ import datetime
import hashlib
import logging
import os
import re

import magic
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
@@ -15,7 +15,7 @@ from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class
from .parsers import ParseError, get_parser_class_for_mime_type
from .signals import (
    document_consumption_finished,
    document_consumption_started
@@ -69,12 +69,6 @@ class Consumer(LoggingMixin):
                "Consumption directory {} does not exist".format(
                    settings.CONSUMPTION_DIR))

    def pre_check_regex(self):
        if not re.match(FileInfo.REGEXES["title"], self.filename):
            raise ConsumerError(
                "Filename {} does not seem to be safe to "
                "consume".format(self.filename))

    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
@@ -118,18 +112,21 @@ class Consumer(LoggingMixin):
        self.pre_check_file_exists()
        self.pre_check_consumption_dir()
        self.pre_check_directories()
        self.pre_check_regex()
        self.pre_check_duplicate()

        self.log("info", "Consuming {}".format(self.filename))

        # Determine the parser class.

        parser_class = get_parser_class(self.filename)
        mime_type = magic.from_file(self.path, mime=True)

        parser_class = get_parser_class_for_mime_type(mime_type)
        if not parser_class:
            raise ConsumerError("No parsers available for {}".format(self.filename))
            raise ConsumerError(f"No parsers available for {self.filename}")
        else:
            self.log("debug", "Parser: {}".format(parser_class.__name__))
            self.log("debug",
                     f"Parser: {parser_class.__name__} "
                     f"based on mime type {mime_type}")

        # Notify all listeners that we're going to do some work.
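The hunk above replaces filename-based parser selection with MIME detection from file content. A rough sketch of that flow, assuming the python-magic package; the registry dict is hypothetical:

import magic  # python-magic

def detect_parser(path, parsers_by_mime):
    # Sniff the MIME type from the file's content, then look up a parser for it.
    mime_type = magic.from_file(path, mime=True)
    parser_class = parsers_by_mime.get(mime_type)
    if not parser_class:
        raise ValueError(f"No parsers available for {path}")
    return mime_type, parser_class

# e.g. parsers_by_mime = {"application/pdf": RasterisedDocumentParser}  # illustrative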
@@ -156,7 +153,7 @@ class Consumer(LoggingMixin):
        # Parse the document. This may take some time.

        try:
            self.log("debug", "Generating thumbnail for {}...".format(self.filename))
            self.log("debug", f"Generating thumbnail for {self.filename}...")
            self._send_progress(self.filename, 10, 100, 'WORKING',
                                'Generating thumbnail...')
            thumbnail = document_parser.get_optimised_thumbnail()
@@ -196,7 +193,8 @@ class Consumer(LoggingMixin):
            # store the document.
            document = self._store(
                text=text,
                date=date
                date=date,
                mime_type=mime_type
            )

        # If we get here, it was successful. Proceed with post-consume
@@ -239,11 +237,11 @@ class Consumer(LoggingMixin):

        return document

    def _store(self, text, date):
    def _store(self, text, date, mime_type):

        # If someone gave us the original filename, use it instead of doc.

        file_info = FileInfo.from_path(self.filename)
        file_info = FileInfo.from_filename(self.filename)

        stats = os.stat(self.path)

@@ -262,7 +260,7 @@ class Consumer(LoggingMixin):
            correspondent=file_info.correspondent,
            title=file_info.title,
            content=text,
            file_type=file_info.extension,
            mime_type=mime_type,
            checksum=hashlib.md5(f.read()).hexdigest(),
            created=created,
            modified=created,
@@ -290,10 +288,12 @@ class Consumer(LoggingMixin):
            document.title = self.override_title

        if self.override_correspondent_id:
            document.correspondent = Correspondent.objects.get(pk=self.override_correspondent_id)
            document.correspondent = Correspondent.objects.get(
                pk=self.override_correspondent_id)

        if self.override_document_type_id:
            document.document_type = DocumentType.objects.get(pk=self.override_document_type_id)
            document.document_type = DocumentType.objects.get(
                pk=self.override_document_type_id)

        if self.override_tag_ids:
            for tag_id in self.override_tag_ids:
@@ -65,38 +65,39 @@ def many_to_dictionary(field):
    return mydictionary


def generate_filename(document):
    # Create filename based on configured format
def generate_filename(doc):
    path = ""

    try:
        if settings.PAPERLESS_FILENAME_FORMAT is not None:
            tags = defaultdict(lambda: slugify(None),
                               many_to_dictionary(document.tags))
                               many_to_dictionary(doc.tags))
            path = settings.PAPERLESS_FILENAME_FORMAT.format(
                correspondent=slugify(document.correspondent),
                title=slugify(document.title),
                created=slugify(document.created),
                created_year=document.created.year if document.created else "none",
                created_month=document.created.month if document.created else "none",
                created_day=document.created.day if document.created else "none",
                added=slugify(document.added),
                added_year=document.added.year if document.added else "none",
                added_month=document.added.month if document.added else "none",
                added_day=document.added.day if document.added else "none",
                correspondent=slugify(doc.correspondent),
                title=slugify(doc.title),
                created=slugify(doc.created),
                created_year=doc.created.year if doc.created else "none",
                created_month=doc.created.month if doc.created else "none",
                created_day=doc.created.day if doc.created else "none",
                added=slugify(doc.added),
                added_year=doc.added.year if doc.added else "none",
                added_month=doc.added.month if doc.added else "none",
                added_day=doc.added.day if doc.added else "none",
                tags=tags,
            )
    except (ValueError, KeyError, IndexError):
        logging.getLogger(__name__).warning("Invalid PAPERLESS_FILENAME_FORMAT: {}, falling back to default".format(settings.PAPERLESS_FILENAME_FORMAT))
        logging.getLogger(__name__).warning(
            f"Invalid PAPERLESS_FILENAME_FORMAT: "
            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")

    # Always append the primary key to guarantee uniqueness of filename
    if len(path) > 0:
        filename = "%s-%07i.%s" % (path, document.pk, document.file_type)
        filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
    else:
        filename = "%07i.%s" % (document.pk, document.file_type)
        filename = "%07i%s" % (doc.pk, doc.file_type)

    # Append .gpg for encrypted files
    if document.storage_type == document.STORAGE_TYPE_GPG:
    if doc.storage_type == doc.STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename
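A compact sketch of the configurable-filename logic above: render a template from document fields and fall back to the bare primary key when the template is invalid. Note that file_type now carries its own leading dot, which is why the ".%s" joins became "%s". All names here are illustrative:

from collections import defaultdict

def build_filename(fmt, pk, extension, fields, tags):
    path = ""
    try:
        if fmt is not None:
            # unknown tag keys resolve to "none" instead of raising KeyError
            tag_map = defaultdict(lambda: "none", tags)
            path = fmt.format(tags=tag_map, **fields)
    except (ValueError, KeyError, IndexError):
        path = ""  # invalid template: fall back to the default name
    if path:
        return "%s-%07i%s" % (path, pk, extension)
    return "%07i%s" % (pk, extension)

print(build_filename("{correspondent}/{title}", 42, ".pdf",
                     {"correspondent": "acme", "title": "invoice"}, {}))
# -> acme/invoice-0000042.pdf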
@@ -3,22 +3,35 @@ import tempfile
from datetime import datetime
from time import mktime

import magic
from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError

from documents.parsers import is_mime_type_supported


class UploadForm(forms.Form):

    document = forms.FileField()

    def clean_document(self):
        document_name = self.cleaned_data.get("document").name

        try:
            validate_filename(self.cleaned_data.get("document").name)
            validate_filename(document_name)
        except ValidationError:
            raise forms.ValidationError("That filename is suspicious.")
        return self.cleaned_data.get("document")

        document_data = self.cleaned_data.get("document").read()

        mime_type = magic.from_buffer(document_data, mime=True)

        if not is_mime_type_supported(mime_type):
            raise forms.ValidationError("This mime type is not supported.")

        return document_name, document_data

    def save(self):
        """
@@ -27,17 +40,20 @@ class UploadForm(forms.Form):
        form do that as well. Think of it as a poor-man's queue server.
        """

        document = self.cleaned_data.get("document").read()
        original_filename = self.cleaned_data.get("document").name
        original_filename, data = self.cleaned_data.get("document")

        t = int(mktime(datetime.now().timetuple()))

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        # TODO: don't just append pdf. This is here for that weird regex check at the start of the consumer.
        with tempfile.NamedTemporaryFile(prefix="paperless-upload-", suffix=".pdf", dir=settings.SCRATCH_DIR, delete=False) as f:
        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                         dir=settings.SCRATCH_DIR,
                                         delete=False) as f:

            f.write(document)
            f.write(data)
            os.utime(f.name, times=(t, t))

            async_task("documents.tasks.consume_file",
                       f.name,
                       override_filename=original_filename,
                       task_name=os.path.basename(original_filename)[:100])
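The form now validates content, not just the filename: it sniffs the MIME type from the uploaded bytes and rejects unsupported types before anything is queued. A small sketch, assuming python-magic and an illustrative set of supported types:

import magic  # python-magic

SUPPORTED_MIME_TYPES = {"application/pdf", "image/png", "image/jpeg"}  # example set

def validate_upload(data: bytes):
    # detect the type from the content itself, not the file name
    mime_type = magic.from_buffer(data, mime=True)
    if mime_type not in SUPPORTED_MIME_TYPES:
        raise ValueError(f"This mime type is not supported: {mime_type}")
    return mime_type

print(validate_upload(b"%PDF-1.4 minimal"))  # -> application/pdf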
@@ -1,6 +1,8 @@
import logging
import os
from contextlib import contextmanager

from django.conf import settings
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.highlight import Formatter, get_text
@@ -8,7 +10,6 @@ from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter

from paperless import settings

logger = logging.getLogger(__name__)

@@ -69,6 +70,8 @@ def open_index(recreate=False):
    # TODO: this is not thread safe. If 2 instances try to create the index
    # at the same time, this fails. This currently prevents parallel
    # tests.
    if not os.path.isdir(settings.INDEX_DIR):
        os.makedirs(settings.INDEX_DIR, exist_ok=True)
        return create_in(settings.INDEX_DIR, get_schema())


@@ -117,6 +120,7 @@ def query_page(ix, query, page):
def autocomplete(ix, term, limit=10):
    with ix.reader() as reader:
        terms = []
        for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()):
        for (score, t) in reader.most_distinctive_terms(
                "content", number=limit, prefix=term.lower()):
            terms.append(t)
        return terms
@@ -19,10 +19,13 @@ class Handler(FileSystemEventHandler):
    def _consume(self, file):
        if os.path.isfile(file):
            try:
                async_task("documents.tasks.consume_file", file, task_name=os.path.basename(file))
                async_task("documents.tasks.consume_file",
                           file,
                           task_name=os.path.basename(file)[:100])
            except Exception as e:
                # Catch all so that the consumer won't crash.
                logging.getLogger(__name__).error("Error while consuming document: {}".format(e))
                logging.getLogger(__name__).error(
                    "Error while consuming document: {}".format(e))

    def on_created(self, event):
        self._consume(event.src_path)
@@ -66,12 +69,14 @@ class Command(BaseCommand):
        # Consume all files as this is not done initially by the watchdog
        for entry in os.scandir(directory):
            if entry.is_file():
                async_task("documents.tasks.consume_file", entry.path, task_name=os.path.basename(entry.path))
                async_task("documents.tasks.consume_file",
                           entry.path,
                           task_name=os.path.basename(entry.path)[:100])

        # Start the watchdog. Woof!
        if settings.CONSUMER_POLLING > 0:
            logging.getLogger(__name__).info('Using polling instead of file'
                                             'system notifications.')
            logging.getLogger(__name__).info(
                "Using polling instead of file system notifications.")
            observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        else:
            observer = Observer()
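A sketch of the startup catch-up pass these hunks adjust: files already sitting in the consumption directory are queued explicitly, since the watchdog only reports new events, and the task name is capped at 100 characters as in the diff. This assumes a configured django-q setup:

import os
from django_q.tasks import async_task

def consume_existing_files(directory):
    for entry in os.scandir(directory):
        if entry.is_file():
            async_task("documents.tasks.consume_file",
                       entry.path,
                       # mirror the [:100] truncation introduced above
                       task_name=os.path.basename(entry.path)[:100])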
@@ -63,7 +63,7 @@ class Command(Renderable, BaseCommand):

            document = document_map[document_dict["pk"]]

            unique_filename = "{:07}_{}".format(document.pk, document.file_name)
            unique_filename = f"{document.pk:07}_{document.file_name}"

            file_target = os.path.join(self.target, unique_filename)

@@ -73,7 +73,7 @@ class Command(Renderable, BaseCommand):
            document_dict[EXPORTER_FILE_NAME] = unique_filename
            document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

            print("Exporting: {}".format(file_target))
            print(f"Exporting: {file_target}")

            t = int(time.mktime(document.created.timetuple()))
            if document.storage_type == Document.STORAGE_TYPE_GPG:
@@ -127,8 +127,8 @@ class Command(Renderable, BaseCommand):
        tags = ",".join([t.slug for t in doc.tags.all()])

        if tags:
            return "{} - {} - {} - {}.{}".format(
            return "{} - {} - {} - {}{}".format(
                created, doc.correspondent, doc.title, tags, doc.file_type)

        return "{} - {} - {}.{}".format(
        return "{} - {} - {}{}".format(
            created, doc.correspondent, doc.title, doc.file_type)
@@ -120,7 +120,7 @@ class Command(Renderable, BaseCommand):
                encrypted.write(GnuPG.encrypted(unencrypted))

        else:
            print("Moving {} to {}".format(document_path, document.source_path))
            print(f"Moving {document_path} to {document.source_path}")
            shutil.copy(document_path, document.source_path)
            shutil.copy(thumbnail_path, document.thumbnail_path)
@@ -74,13 +74,13 @@ class Command(Renderable, BaseCommand):
        try:
            classifier.reload()
        except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
            logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
            logging.getLogger(__name__).warning(
                f"Cannot classify documents: {e}.")
            classifier = None

        for document in documents:
            logging.getLogger(__name__).info(
                "Processing document {}".format(document.title)
            )
                f"Processing document {document.title}")

            if options['correspondent']:
                set_correspondent(
@@ -6,25 +6,42 @@ from documents.models import MatchingModel, Correspondent, DocumentType, Tag


def match_correspondents(document_content, classifier):
    correspondents = Correspondent.objects.all()
    predicted_correspondent_id = classifier.predict_correspondent(document_content) if classifier else None
    if classifier:
        pred_id = classifier.predict_correspondent(document_content)
    else:
        pred_id = None

    return [o for o in correspondents if matches(o, document_content) or o.pk == predicted_correspondent_id]
    correspondents = Correspondent.objects.all()

    return list(filter(
        lambda o: matches(o, document_content) or o.pk == pred_id,
        correspondents))


def match_document_types(document_content, classifier):
    document_types = DocumentType.objects.all()
    predicted_document_type_id = classifier.predict_document_type(document_content) if classifier else None
    if classifier:
        pred_id = classifier.predict_document_type(document_content)
    else:
        pred_id = None

    return [o for o in document_types if matches(o, document_content) or o.pk == predicted_document_type_id]
    document_types = DocumentType.objects.all()

    return list(filter(
        lambda o: matches(o, document_content) or o.pk == pred_id,
        document_types))


def match_tags(document_content, classifier):
    objects = Tag.objects.all()
    predicted_tag_ids = classifier.predict_tags(document_content) if classifier else []
    if classifier:
        predicted_tag_ids = classifier.predict_tags(document_content)
    else:
        predicted_tag_ids = []

    matched_tags = [o for o in objects if matches(o, document_content) or o.pk in predicted_tag_ids]
    return matched_tags
    tags = Tag.objects.all()

    return list(filter(
        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
        tags))


def matches(matching_model, document_content):
@@ -42,39 +59,45 @@ def matches(matching_model, document_content):
    if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
        for word in _split_match(matching_model):
            search_result = re.search(
                r"\b{}\b".format(word), document_content, **search_kwargs)
                rf"\b{word}\b", document_content, **search_kwargs)
            if not search_result:
                return False
        return True

    if matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
    elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
        for word in _split_match(matching_model):
            if re.search(r"\b{}\b".format(word), document_content, **search_kwargs):
            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
                return True
        return False

    if matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
    elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
        return bool(re.search(
            r"\b{}\b".format(matching_model.match), document_content, **search_kwargs))
            rf"\b{matching_model.match}\b",
            document_content,
            **search_kwargs
        ))

    if matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
    elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
        return bool(re.search(
            re.compile(matching_model.match, **search_kwargs), document_content))
            re.compile(matching_model.match, **search_kwargs),
            document_content
        ))

    if matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
    elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
        match = re.sub(r'[^\w\s]', '', matching_model.match)
        text = re.sub(r'[^\w\s]', '', document_content)
        if matching_model.is_insensitive:
            match = match.lower()
            text = text.lower()

        return True if fuzz.partial_ratio(match, text) >= 90 else False
        return fuzz.partial_ratio(match, text) >= 90

    if matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
    elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.
        return False

    raise NotImplementedError("Unsupported matching algorithm")
    else:
        raise NotImplementedError("Unsupported matching algorithm")


def _split_match(matching_model):
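A trimmed, standalone illustration of two branches of the dispatch above: whole-word matching via \b anchors, and fuzzy matching with a partial ratio against the 90 threshold from the diff (fuzz here is rapidfuzz's fuzzywuzzy-compatible module):

import re
from rapidfuzz import fuzz

def matches_any(match_words, content, insensitive=True):
    flags = {"flags": re.IGNORECASE} if insensitive else {}
    return any(re.search(rf"\b{word}\b", content, **flags)
               for word in match_words.split())

def matches_fuzzy(match, content, threshold=90):
    # strip punctuation and normalize case, then compare
    match = re.sub(r'[^\w\s]', '', match).lower()
    content = re.sub(r'[^\w\s]', '', content).lower()
    return fuzz.partial_ratio(match, content) >= threshold

print(matches_any("invoice receipt", "Your invoice is attached"))  # True
print(matches_fuzzy("ACME Inc.", "acme inc annual statement"))     # True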
@@ -1,4 +1,6 @@
# Generated by Django 3.1.3 on 2020-11-07 12:35
import uuid

from django.db import migrations, models
import django.db.models.deletion

@@ -20,6 +22,14 @@ def make_index(apps, schema_editor):
        print(" --> Cannot create document index.")


def logs_set_default_group(apps, schema_editor):
    Log = apps.get_model('documents', 'Log')
    for log in Log.objects.all():
        if log.group is None:
            log.group = uuid.uuid4()
            log.save()


class Migration(migrations.Migration):

    dependencies = [
@@ -85,6 +95,10 @@ class Migration(migrations.Migration):
            name='group',
            field=models.UUIDField(blank=True, null=True),
        ),
        migrations.RunPython(
            code=django.db.migrations.operations.special.RunPython.noop,
            reverse_code=logs_set_default_group
        ),
        migrations.RunPython(
            code=make_index,
            reverse_code=django.db.migrations.operations.special.RunPython.noop,
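The new operation pairs a no-op forward step with a real reverse step, so rolling the migration back repopulates the field. A minimal sketch of that pattern; model, app, and dependency names are illustrative:

import uuid

from django.db import migrations


def backfill_group(apps, schema_editor):
    Log = apps.get_model('documents', 'Log')
    for log in Log.objects.filter(group__isnull=True):
        log.group = uuid.uuid4()
        log.save()


class Migration(migrations.Migration):
    dependencies = [('documents', '1001_example')]  # hypothetical

    operations = [
        # forward does nothing; reverse restores the data
        migrations.RunPython(migrations.RunPython.noop, backfill_group),
    ]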
src/documents/migrations/1003_mime_types.py (new file, 77 lines)
@@ -0,0 +1,77 @@
# Generated by Django 3.1.3 on 2020-11-20 11:21
import mimetypes
import os

import magic
from django.conf import settings
from django.db import migrations, models


def source_path(self):
    if self.filename:
        fname = str(self.filename)
    else:
        fname = "{:07}.{}".format(self.pk, self.file_type)
        if self.storage_type == self.STORAGE_TYPE_GPG:
            fname += ".gpg"

    return os.path.join(
        settings.ORIGINALS_DIR,
        fname
    )


def add_mime_types(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    documents = Document.objects.all()

    for d in documents:
        d.mime_type = magic.from_file(source_path(d), mime=True)
        d.save()


def add_file_extensions(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    documents = Document.objects.all()

    for d in documents:
        d.file_type = os.path.splitext(d.filename)[1].strip('.')
        d.save()


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1002_auto_20201111_1105'),
    ]

    operations = [
        migrations.AddField(
            model_name='document',
            name='mime_type',
            field=models.CharField(default="-", editable=False, max_length=256),
            preserve_default=False,
        ),
        migrations.RunPython(add_mime_types, migrations.RunPython.noop),

        # This operation is here so that we can revert the entire migration:
        # By allowing this field to be blank and null, we can revert the
        # remove operation further down and the database won't complain about
        # NOT NULL violations.
        migrations.AlterField(
            model_name='document',
            name='file_type',
            field=models.CharField(
                choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')],
                editable=False,
                max_length=4,
                null=True,
                blank=True
            ),
        ),
        migrations.RunPython(migrations.RunPython.noop, add_file_extensions),
        migrations.RemoveField(
            model_name='document',
            name='file_type',
        ),
    ]
@@ -1,6 +1,7 @@
# coding=utf-8

import logging
import mimetypes
import os
import re
from collections import OrderedDict
@@ -113,18 +114,6 @@ class DocumentType(MatchingModel):

class Document(models.Model):

    # TODO: why do we need an explicit list
    TYPE_PDF = "pdf"
    TYPE_PNG = "png"
    TYPE_JPG = "jpg"
    TYPE_GIF = "gif"
    TYPE_TIF = "tiff"
    TYPE_TXT = "txt"
    TYPE_CSV = "csv"
    TYPE_MD = "md"
    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
             TYPE_TXT, TYPE_CSV, TYPE_MD)

    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
    STORAGE_TYPE_GPG = "gpg"
    STORAGE_TYPES = (
@@ -156,10 +145,9 @@ class Document(models.Model):
        "primarily used for searching."
    )

    file_type = models.CharField(
        max_length=4,
        editable=False,
        choices=tuple([(t, t.upper()) for t in TYPES])
    mime_type = models.CharField(
        max_length=256,
        editable=False
    )

    tags = models.ManyToManyField(
@@ -223,7 +211,7 @@ class Document(models.Model):
        if self.filename:
            fname = str(self.filename)
        else:
            fname = "{:07}.{}".format(self.pk, self.file_type)
            fname = "{:07}{}".format(self.pk, self.file_type)
            if self.storage_type == self.STORAGE_TYPE_GPG:
                fname += ".gpg"

@@ -238,7 +226,11 @@ class Document(models.Model):

    @property
    def file_name(self):
        return slugify(str(self)) + "." + self.file_type
        return slugify(str(self)) + self.file_type

    @property
    def file_type(self):
        return mimetypes.guess_extension(str(self.mime_type))

    @property
    def thumbnail_path(self):
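file_type is now derived from the stored MIME type instead of being stored itself. mimetypes.guess_extension returns the extension with a leading dot, which is why the string joins above dropped their explicit ".":

import mimetypes

print(mimetypes.guess_extension("application/pdf"))  # .pdf
print(mimetypes.guess_extension("image/png"))        # .png

# old: "{:07}.{}".format(1, "pdf")  -> "0000001.pdf"
# new: "{:07}{}".format(1, ".pdf")  -> "0000001.pdf"
print("{:07}{}".format(1, mimetypes.guess_extension("application/pdf")))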
@@ -278,6 +270,7 @@ class Log(models.Model):
        return self.message


# TODO: why is this in the models file?
class FileInfo:

    # This epic regex *almost* worked for our needs, so I'm keeping it here for
@@ -292,53 +285,44 @@ class FileInfo:
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )
    # TODO: what is this used for
    formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>{})$".format(formats),
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>{})$".format(formats),
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>{})$".format(formats),
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>{})$".format(formats),
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>{})$".format(formats),
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?"
            r"\.(?P<extension>{})$".format(formats),
            r"(?P<title>.*)?$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)"
            r"\.(?P<extension>{})$".format(formats),
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        ))
    ])
@@ -381,15 +365,6 @@ class FileInfo:
                )[0])
        return tuple(r)

    @classmethod
    def _get_extension(cls, extension):
        r = extension.lower()
        if r == "jpeg":
            return "jpg"
        if r == "tif":
            return "tiff"
        return r

    @classmethod
    def _mangle_property(cls, properties, name):
        if name in properties:
@@ -398,18 +373,16 @@ class FileInfo:
        )

    @classmethod
    def from_path(cls, path):
    def from_filename(cls, filename):
        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<date> - <correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"
          "<date> - <correspondent> - <title> - <tags>"
          "<correspondent> - <title> - <tags>"
          "<correspondent> - <title>"
          "<title>"
        """

        filename = os.path.basename(path)

        # Mutate filename in-place before parsing its components
        # by applying at most one of the configured transformations.
        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
@@ -417,6 +390,23 @@ class FileInfo:
            if count:
                break

        # do this after the transforms so that the transforms can do whatever
        # with the file extension.
        filename_no_ext = os.path.splitext(filename)[0]

        if filename_no_ext == filename and filename.startswith("."):
            # This is a very special case where there is no text before the
            # file type.
            # TODO: this should be handled better. The ext is not removed
            # because usually, files like '.pdf' are just hidden files
            # with the name pdf, but in our case, it's more likely that
            # there's just no name to begin with.
            filename = ""
            # This isn't too bad either, since we'll just not match anything
            # and return an empty title. TODO: actually, this is kinda bad.
        else:
            filename = filename_no_ext

        # Parse filename components.
        for regex in cls.REGEXES.values():
            m = regex.match(filename)
@@ -426,5 +416,4 @@ class FileInfo:
            cls._mangle_property(properties, "correspondent")
            cls._mangle_property(properties, "title")
            cls._mangle_property(properties, "tags")
            cls._mangle_property(properties, "extension")
            return cls(**properties)
@@ -6,6 +6,7 @@ import subprocess
import tempfile

import dateparser
import magic
from django.conf import settings
from django.utils import timezone

@@ -37,10 +38,11 @@ DATE_REGEX = re.compile(
logger = logging.getLogger(__name__)


def get_parser_class(doc):
    """
    Determine the appropriate parser class based on the file
    """
def is_mime_type_supported(mime_type):
    return get_parser_class_for_mime_type(mime_type) is not None


def get_parser_class_for_mime_type(mime_type):

    options = []

@@ -48,9 +50,9 @@ def get_parser_class(doc):

    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
        parser_test = parser_declaration["test"]
        supported_mime_types = parser_declaration["mime_types"]

        if parser_test(doc):
        if mime_type in supported_mime_types:
            options.append(parser_declaration)

    if not options:
@@ -61,7 +63,28 @@ def get_parser_class(doc):
        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]


def run_convert(input_file, output_file, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
def get_parser_class(path):
    """
    Determine the appropriate parser class based on the file
    """

    mime_type = magic.from_file(path, mime=True)

    return get_parser_class_for_mime_type(mime_type)


def run_convert(input_file,
                output_file,
                density=None,
                scale=None,
                alpha=None,
                strip=False,
                trim=False,
                type=None,
                depth=None,
                extra=None,
                logging_group=None):

    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
@@ -90,10 +113,13 @@ def run_unpaper(pnm, logging_group=None):
    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                    pnm_out)

    logger.debug("Execute: " + " ".join(command_args), extra={'group': logging_group})
    logger.debug(f"Execute: {' '.join(command_args)}",
                 extra={'group': logging_group})

    if not subprocess.Popen(command_args, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait() == 0:
        raise ParseError("Unpaper failed at {}".format(command_args))
    if not subprocess.Popen(command_args,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL).wait() == 0:
        raise ParseError(f"Unpaper failed at {command_args}")

    return pnm_out

@@ -112,7 +138,8 @@ class DocumentParser(LoggingMixin):
        super().__init__()
        self.logging_group = logging_group
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)
        self.progress_callback = progress_callback

    def get_thumbnail(self):

@@ -126,9 +153,10 @@ class DocumentParser(LoggingMixin):
        if settings.OPTIMIZE_THUMBNAILS:
            out_path = os.path.join(self.tempdir, "optipng.png")

            args = (settings.OPTIPNG_BINARY, "-silent", "-o5", in_path, "-out", out_path)
            args = (settings.OPTIPNG_BINARY,
                    "-silent", "-o5", in_path, "-out", out_path)

            self.log('debug', 'Execute: ' + " ".join(args))
            self.log('debug', f"Execute: {' '.join(args)}")

            if not subprocess.Popen(args).wait() == 0:
                raise ParseError("Optipng failed at {}".format(args))
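When several registered parsers support the same MIME type, the code above picks the declaration with the highest weight. A standalone sketch of that selection; the declarations are made up but mirror the {"parser", "weight", "mime_types"} shape from the diff:

def pick_parser(mime_type, declarations):
    options = [d for d in declarations if mime_type in d["mime_types"]]
    if not options:
        return None
    # highest weight wins
    return max(options, key=lambda d: d["weight"])["parser"]

declarations = [
    {"parser": "TextParser", "weight": 0, "mime_types": ["text/plain"]},
    {"parser": "TesseractParser", "weight": 10,
     "mime_types": ["application/pdf", "image/png"]},
]
print(pick_parser("application/pdf", declarations))  # TesseractParser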
@@ -76,9 +76,11 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):

class DocumentSerializer(serializers.ModelSerializer):

    correspondent_id = CorrespondentField(allow_null=True, source='correspondent')
    correspondent_id = CorrespondentField(
        allow_null=True, source='correspondent')
    tags_id = TagsField(many=True, source='tags')
    document_type_id = DocumentTypeField(allow_null=True, source='document_type')
    document_type_id = DocumentTypeField(
        allow_null=True, source='document_type')

    class Meta:
        model = Document
@@ -91,7 +93,7 @@ class DocumentSerializer(serializers.ModelSerializer):
            "document_type_id",
            "title",
            "content",
            "file_type",
            "mime_type",
            "tags",
            "tags_id",
            "checksum",
@@ -25,11 +25,18 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
        document.tags.add(*inbox_tags)


def set_correspondent(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
def set_correspondent(sender,
                      document=None,
                      logging_group=None,
                      classifier=None,
                      replace=False,
                      use_first=True,
                      **kwargs):
    if document.correspondent and not replace:
        return

    potential_correspondents = matching.match_correspondents(document.content, classifier)
    potential_correspondents = matching.match_correspondents(document.content,
                                                             classifier)

    potential_count = len(potential_correspondents)
    if potential_correspondents:
@@ -38,22 +45,22 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
        selected = None
    if potential_count > 1:
        if use_first:
            message = "Detected {} potential correspondents, so we've opted for {}"
            logger(
                message.format(potential_count, selected),
                f"Detected {potential_count} potential correspondents, "
                f"so we've opted for {selected}",
                logging_group
            )
        else:
            message = "Detected {} potential correspondents, not assigning any correspondent"
            logger(
                message.format(potential_count),
                f"Detected {potential_count} potential correspondents, "
                f"not assigning any correspondent",
                logging_group
            )
            return

    if selected or replace:
        logger(
            'Assigning correspondent "{}" to "{}" '.format(selected, document),
            f"Assigning correspondent {selected} to {document}",
            logging_group
        )

@@ -61,11 +68,18 @@ def set_correspondent(sender, document=None, logging_group=None, classifier=None
        document.save(update_fields=("correspondent",))


def set_document_type(sender, document=None, logging_group=None, classifier=None, replace=False, use_first=True, **kwargs):
def set_document_type(sender,
                      document=None,
                      logging_group=None,
                      classifier=None,
                      replace=False,
                      use_first=True,
                      **kwargs):
    if document.document_type and not replace:
        return

    potential_document_type = matching.match_document_types(document.content, classifier)
    potential_document_type = matching.match_document_types(document.content,
                                                            classifier)

    potential_count = len(potential_document_type)
    if potential_document_type:
@@ -75,22 +89,22 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None

    if potential_count > 1:
        if use_first:
            message = "Detected {} potential document types, so we've opted for {}"
            logger(
                message.format(potential_count, selected),
                f"Detected {potential_count} potential document types, "
                f"so we've opted for {selected}",
                logging_group
            )
        else:
            message = "Detected {} potential document types, not assigning any document type"
            logger(
                message.format(potential_count),
                f"Detected {potential_count} potential document types, "
                f"not assigning any document type",
                logging_group
            )
            return

    if selected or replace:
        logger(
            'Assigning document type "{}" to "{}" '.format(selected, document),
            f"Assigning document type {selected} to {document}",
            logging_group
        )

@@ -98,14 +112,21 @@ def set_document_type(sender, document=None, logging_group=None, classifier=None
        document.save(update_fields=("document_type",))


def set_tags(sender, document=None, logging_group=None, classifier=None, replace=False, **kwargs):
def set_tags(sender,
             document=None,
             logging_group=None,
             classifier=None,
             replace=False,
             **kwargs):
    if replace:
        document.tags.clear()
        current_tags = set([])
    else:
        current_tags = set(document.tags.all())

    relevant_tags = set(matching.match_tags(document.content, classifier)) - current_tags
    matched_tags = matching.match_tags(document.content, classifier)

    relevant_tags = set(matched_tags) - current_tags

    if not relevant_tags:
        return
@@ -180,12 +201,15 @@ def update_filename_and_move_files(sender, instance, **kwargs):

    if not os.path.isfile(old_path):
        # Can't do anything if the old file does not exist anymore.
        logging.getLogger(__name__).fatal('Document {}: File {} has gone.'.format(str(instance), old_path))
        logging.getLogger(__name__).fatal(
            f"Document {str(instance)}: File {old_path} has gone.")
        return

    if os.path.isfile(new_path):
        # Can't do anything if the new file already exists. Skip updating file.
        logging.getLogger(__name__).warning('Document {}: Cannot rename file since target path {} already exists.'.format(str(instance), new_path))
        logging.getLogger(__name__).warning(
            f"Document {str(instance)}: Cannot rename file "
            f"since target path {new_path} already exists.")
        return

    create_source_path_directory(new_path)
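The tag handler above reduces to set arithmetic: only tags that matched but are not already attached get added. A self-contained illustration:

def tags_to_add(matched_tags, current_tags, replace=False):
    current = set() if replace else set(current_tags)
    return set(matched_tags) - current

print(tags_to_add({"invoice", "2020"}, {"invoice"}))        # {'2020'}
print(tags_to_add({"invoice", "2020"}, {"invoice"}, True))  # {'2020', 'invoice'}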
@@ -45,7 +45,7 @@ class DocumentApiTest(APITestCase):
        dt = DocumentType.objects.create(name="dt", pk=63)
        tag = Tag.objects.create(name="t", pk=85)

        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123")
        doc = Document.objects.create(title="WOW", content="the content", correspondent=c, document_type=dt, checksum="123", mime_type="application/pdf")

        doc.tags.add(tag)

@@ -95,7 +95,7 @@ class DocumentApiTest(APITestCase):
        with open(filename, "wb") as f:
            f.write(content)

        doc = Document.objects.create(title="none", filename=os.path.basename(filename), file_type="pdf")
        doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")

        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
            f.write(content_thumbnail)
@@ -117,7 +117,7 @@ class DocumentApiTest(APITestCase):

    def test_document_actions_not_existing_file(self):

        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), file_type="pdf")
        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
        self.assertEqual(response.status_code, 404)
@@ -130,9 +130,9 @@ class DocumentApiTest(APITestCase):

    def test_document_filters(self):

        doc1 = Document.objects.create(title="none1", checksum="A")
        doc2 = Document.objects.create(title="none2", checksum="B")
        doc3 = Document.objects.create(title="none3", checksum="C")
        doc1 = Document.objects.create(title="none1", checksum="A", mime_type="application/pdf")
        doc2 = Document.objects.create(title="none2", checksum="B", mime_type="application/pdf")
        doc3 = Document.objects.create(title="none3", checksum="C", mime_type="application/pdf")

        tag_inbox = Tag.objects.create(name="t1", is_inbox_tag=True)
        tag_2 = Tag.objects.create(name="t2")
@@ -15,11 +15,3 @@ class ChecksTestCase(TestCase):
    def test_changed_password_check_no_encryption(self):
        DocumentFactory.create(storage_type=Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(changed_password_check(None), [])

    @unittest.skip("I don't know how to test this")
    def test_changed_password_check_gpg_encryption_with_good_password(self):
        pass

    @unittest.skip("I don't know how to test this")
    def test_changed_password_check_fail(self):
        pass
@@ -15,57 +15,42 @@ from ..parsers import DocumentParser, ParseError
|
||||
class TestAttributes(TestCase):
|
||||
|
||||
TAGS = ("tag1", "tag2", "tag3")
|
||||
EXTENSIONS = (
|
||||
"pdf", "png", "jpg", "jpeg", "gif", "tiff", "tif",
|
||||
"PDF", "PNG", "JPG", "JPEG", "GIF", "TIFF", "TIF",
|
||||
"PdF", "PnG", "JpG", "JPeG", "GiF", "TiFf", "TiF",
|
||||
)
|
||||
|
||||
def _test_guess_attributes_from_name(self, path, sender, title, tags):
|
||||
def _test_guess_attributes_from_name(self, filename, sender, title, tags):
|
||||
file_info = FileInfo.from_filename(filename)
|
||||
|
||||
for extension in self.EXTENSIONS:
|
||||
if sender:
|
||||
self.assertEqual(file_info.correspondent.name, sender, filename)
|
||||
else:
|
||||
self.assertIsNone(file_info.correspondent, filename)
|
||||
|
||||
f = path.format(extension)
|
||||
file_info = FileInfo.from_path(f)
|
||||
self.assertEqual(file_info.title, title, filename)
|
||||
|
||||
if sender:
|
||||
self.assertEqual(file_info.correspondent.name, sender, f)
|
||||
else:
|
||||
self.assertIsNone(file_info.correspondent, f)
|
||||
|
||||
self.assertEqual(file_info.title, title, f)
|
||||
|
||||
self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
|
||||
if extension.lower() == "jpeg":
|
||||
self.assertEqual(file_info.extension, "jpg", f)
|
||||
elif extension.lower() == "tif":
|
||||
self.assertEqual(file_info.extension, "tiff", f)
|
||||
else:
|
||||
self.assertEqual(file_info.extension, extension.lower(), f)
|
||||
self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, filename)
|
||||
|
||||
def test_guess_attributes_from_name0(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Sender - Title.{}", "Sender", "Title", ())
|
||||
"Sender - Title.pdf", "Sender", "Title", ())
|
||||
|
||||
def test_guess_attributes_from_name1(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Spaced Sender - Title.{}", "Spaced Sender", "Title", ())
|
||||
"Spaced Sender - Title.pdf", "Spaced Sender", "Title", ())
|
||||
|
||||
def test_guess_attributes_from_name2(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Sender - Spaced Title.{}", "Sender", "Spaced Title", ())
|
||||
"Sender - Spaced Title.pdf", "Sender", "Spaced Title", ())
|
||||
|
||||
def test_guess_attributes_from_name3(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Dashed-Sender - Title.{}", "Dashed-Sender", "Title", ())
|
||||
"Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ())
|
||||
|
||||
def test_guess_attributes_from_name4(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Sender - Dashed-Title.{}", "Sender", "Dashed-Title", ())
|
||||
"Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ())
|
||||
|
||||
def test_guess_attributes_from_name5(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Sender - Title - tag1,tag2,tag3.{}",
|
||||
"Sender - Title - tag1,tag2,tag3.pdf",
|
||||
"Sender",
|
||||
"Title",
|
||||
self.TAGS
|
||||
@@ -73,7 +58,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name6(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}",
|
||||
"Spaced Sender - Title - tag1,tag2,tag3.pdf",
|
||||
"Spaced Sender",
|
||||
"Title",
|
||||
self.TAGS
|
||||
@@ -81,7 +66,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name7(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}",
|
||||
"Sender - Spaced Title - tag1,tag2,tag3.pdf",
|
||||
"Sender",
|
||||
"Spaced Title",
|
||||
self.TAGS
|
||||
@@ -89,7 +74,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name8(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Dashed-Sender - Title - tag1,tag2,tag3.{}",
|
||||
"Dashed-Sender - Title - tag1,tag2,tag3.pdf",
|
||||
"Dashed-Sender",
|
||||
"Title",
|
||||
self.TAGS
|
||||
@@ -97,7 +82,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name9(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Sender - Dashed-Title - tag1,tag2,tag3.{}",
|
||||
"Sender - Dashed-Title - tag1,tag2,tag3.pdf",
|
||||
"Sender",
|
||||
"Dashed-Title",
|
||||
self.TAGS
|
||||
@@ -105,7 +90,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name10(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
"/path/to/Σενδερ - Τιτλε - tag1,tag2,tag3.{}",
|
||||
"Σενδερ - Τιτλε - tag1,tag2,tag3.pdf",
|
||||
"Σενδερ",
|
||||
"Τιτλε",
|
||||
self.TAGS
|
||||
@@ -113,7 +98,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name_when_correspondent_empty(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
'/path/to/ - weird empty correspondent but should not break.{}',
|
||||
' - weird empty correspondent but should not break.pdf',
|
||||
None,
|
||||
'weird empty correspondent but should not break',
|
||||
()
|
||||
@@ -121,7 +106,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name_when_title_starts_with_dash(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
'/path/to/- weird but should not break.{}',
|
||||
'- weird but should not break.pdf',
|
||||
None,
|
||||
'- weird but should not break',
|
||||
()
|
||||
@@ -129,7 +114,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name_when_title_ends_with_dash(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
'/path/to/weird but should not break -.{}',
|
||||
'weird but should not break -.pdf',
|
||||
None,
|
||||
'weird but should not break -',
|
||||
()
|
||||
@@ -137,7 +122,7 @@ class TestAttributes(TestCase):
|
||||
|
||||
def test_guess_attributes_from_name_when_title_is_empty(self):
|
||||
self._test_guess_attributes_from_name(
|
||||
'/path/to/weird correspondent but should not break - .{}',
|
||||
'weird correspondent but should not break - .pdf',
|
||||
'weird correspondent but should not break',
|
||||
'',
|
||||
()
|
||||
@@ -149,11 +134,11 @@ class TestAttributes(TestCase):
|
||||
:return:
|
||||
"""
|
||||
|
||||
path = "Title - Correspondent - tAg1,TAG2.pdf"
|
||||
self.assertEqual(len(FileInfo.from_path(path).tags), 2)
|
||||
filename = "Title - Correspondent - tAg1,TAG2.pdf"
|
||||
self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
|
||||
|
||||
path = "Title - Correspondent - tag1,tag2.pdf"
|
||||
self.assertEqual(len(FileInfo.from_path(path).tags), 2)
|
||||
self.assertEqual(len(FileInfo.from_filename(filename).tags), 2)
|
||||
|
||||
self.assertEqual(Tag.objects.all().count(), 2)
|
||||
|
||||
@@ -173,13 +158,12 @@ class TestFieldPermutations(TestCase):
|
||||
]
|
||||
valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
|
||||
valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
|
||||
valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]
|
||||
|
||||
def _test_guessed_attributes(self, filename, created=None,
|
||||
correspondent=None, title=None,
|
||||
extension=None, tags=None):
|
||||
tags=None):
|
||||
|
||||
info = FileInfo.from_path(filename)
|
||||
info = FileInfo.from_filename(filename)
|
||||
|
||||
# Created
|
||||
if created is None:
|
||||
@@ -207,68 +191,56 @@ class TestFieldPermutations(TestCase):
|
||||
filename
|
||||
)
|
||||
|
||||
# Extension
|
||||
if extension == 'jpeg':
|
||||
extension = 'jpg'
|
||||
self.assertEqual(info.extension, extension, filename)
|
||||
|
||||
def test_just_title(self):
template = '/path/to/{title}.{extension}'
template = '{title}.pdf'
for title in self.valid_titles:
for extension in self.valid_extensions:
spec = dict(title=title, extension=extension)
spec = dict(title=title)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)

def test_title_and_correspondent(self):
template = '{correspondent} - {title}.pdf'
for correspondent in self.valid_correspondents:
for title in self.valid_titles:
spec = dict(correspondent=correspondent, title=title)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)

def test_title_and_correspondent(self):
template = '/path/to/{correspondent} - {title}.{extension}'
for correspondent in self.valid_correspondents:
for title in self.valid_titles:
for extension in self.valid_extensions:
spec = dict(correspondent=correspondent, title=title,
extension=extension)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)

def test_title_and_correspondent_and_tags(self):
template = '/path/to/{correspondent} - {title} - {tags}.{extension}'
template = '{correspondent} - {title} - {tags}.pdf'
for correspondent in self.valid_correspondents:
for title in self.valid_titles:
for tags in self.valid_tags:
for extension in self.valid_extensions:
spec = dict(correspondent=correspondent, title=title,
tags=tags, extension=extension)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)
spec = dict(correspondent=correspondent, title=title,
tags=tags)
filename = template.format(**spec)
self._test_guessed_attributes(filename, **spec)

def test_created_and_correspondent_and_title_and_tags(self):

template = (
"/path/to/{created} - "
"{created} - "
"{correspondent} - "
"{title} - "
"{tags}"
".{extension}"
"{tags}.pdf"
)

for created in self.valid_dates:
for correspondent in self.valid_correspondents:
for title in self.valid_titles:
for tags in self.valid_tags:
for extension in self.valid_extensions:
spec = {
"created": created,
"correspondent": correspondent,
"title": title,
"tags": tags,
"extension": extension
}
self._test_guessed_attributes(
template.format(**spec), **spec)
spec = {
"created": created,
"correspondent": correspondent,
"title": title,
"tags": tags,
}
self._test_guessed_attributes(
template.format(**spec), **spec)

def test_created_and_correspondent_and_title(self):

template = "/path/to/{created} - {correspondent} - {title}.{extension}"
template = "{created} - {correspondent} - {title}.pdf"

for created in self.valid_dates:
for correspondent in self.valid_correspondents:
@@ -279,56 +251,50 @@ class TestFieldPermutations(TestCase):
if title.lower() == title:
continue

for extension in self.valid_extensions:
spec = {
"created": created,
"correspondent": correspondent,
"title": title,
"extension": extension
}
self._test_guessed_attributes(
template.format(**spec), **spec)

def test_created_and_title(self):

template = "/path/to/{created} - {title}.{extension}"

for created in self.valid_dates:
for title in self.valid_titles:
for extension in self.valid_extensions:
spec = {
"created": created,
"title": title,
"extension": extension
"correspondent": correspondent,
"title": title
}
self._test_guessed_attributes(
template.format(**spec), **spec)

def test_created_and_title(self):

template = "{created} - {title}.pdf"

for created in self.valid_dates:
for title in self.valid_titles:
spec = {
"created": created,
"title": title
}
self._test_guessed_attributes(
template.format(**spec), **spec)

def test_created_and_title_and_tags(self):

template = "/path/to/{created} - {title} - {tags}.{extension}"
template = "{created} - {title} - {tags}.pdf"

for created in self.valid_dates:
for title in self.valid_titles:
for tags in self.valid_tags:
for extension in self.valid_extensions:
spec = {
"created": created,
"title": title,
"tags": tags,
"extension": extension
}
self._test_guessed_attributes(
template.format(**spec), **spec)
spec = {
"created": created,
"title": title,
"tags": tags
}
self._test_guessed_attributes(
template.format(**spec), **spec)

def test_invalid_date_format(self):
info = FileInfo.from_path("/path/to/06112017Z - title.pdf")
info = FileInfo.from_filename("06112017Z - title.pdf")
self.assertEqual(info.title, "title")
self.assertIsNone(info.created)
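
The test above depends on the parser rejecting malformed leading timestamps: under the expected YYYYMMDD reading, "06112017Z" would mean year 0611 and month 20, which cannot parse, so created stays None (the exact accepted format lives in FileInfo's regexes, which this diff does not show). A hedged illustration:

    # Behaviour inferred from the assertions above, not from the regexes:
    info = FileInfo.from_filename("06112017Z - title.pdf")
    assert info.created is None   # not a valid timestamp prefix
    assert info.title == "title"
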
def test_filename_parse_transforms(self):

path = "/some/path/to/tag1,tag2_20190908_180610_0001.pdf"
filename = "tag1,tag2_20190908_180610_0001.pdf"
all_patt = re.compile("^.*$")
none_patt = re.compile("$a")
exact_patt = re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
@@ -336,50 +302,44 @@ class TestFieldPermutations(TestCase):
repl2 = "\\2Z - " + repl1 # creation date + repl1

# No transformations configured (= default)
info = FileInfo.from_path(path)
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
self.assertEqual(info.extension, "pdf")
self.assertEqual(info.tags, ())
self.assertIsNone(info.created)

# Pattern doesn't match (filename unaltered)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
info = FileInfo.from_path(path)
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
self.assertEqual(info.extension, "pdf")

# Simple transformation (match all)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
info = FileInfo.from_path(path)
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "all")
self.assertEqual(info.extension, "gif")

# Multiple transformations configured (first pattern matches)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[
(all_patt, "all.gif"),
(all_patt, "anotherall.gif")]):
info = FileInfo.from_path(path)
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "all")
self.assertEqual(info.extension, "gif")

# Multiple transformations configured (second pattern matches)
with self.settings(
FILENAME_PARSE_TRANSFORMS=[
(none_patt, "none.gif"),
(all_patt, "anotherall.gif")]):
info = FileInfo.from_path(path)
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "anotherall")
self.assertEqual(info.extension, "gif")

# Complex transformation without date in replacement string
with self.settings(
FILENAME_PARSE_TRANSFORMS=[(exact_patt, repl1)]):
info = FileInfo.from_path(path)
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "0001")
self.assertEqual(info.extension, "pdf")
self.assertEqual(len(info.tags), 2)
self.assertEqual(info.tags[0].slug, "tag1")
self.assertEqual(info.tags[1].slug, "tag2")
@@ -392,9 +352,8 @@ class TestFieldPermutations(TestCase):
(exact_patt, repl2), # <-- matches
(exact_patt, repl1),
(all_patt, "all.gif")]):
info = FileInfo.from_path(path)
info = FileInfo.from_filename(filename)
self.assertEqual(info.title, "0001")
self.assertEqual(info.extension, "pdf")
self.assertEqual(len(info.tags), 2)
self.assertEqual(info.tags[0].slug, "tag1")
self.assertEqual(info.tags[1].slug, "tag2")
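
For context, the FILENAME_PARSE_TRANSFORMS setting exercised above is a list of (compiled pattern, replacement) pairs, and the first pattern that matches the incoming filename is applied before normal parsing, as the "first/second pattern matches" cases verify. A minimal sketch of such a setting, mirroring the test fixtures (the settings-file syntax is assumed):

    import re

    FILENAME_PARSE_TRANSFORMS = [
        # Rewrites "tag1,tag2_20190908_180610_0001.pdf" into a
        # "<date>Z - <tags>." layout before the usual filename parsing runs.
        (re.compile("^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\."),
         "\\2Z - \\1."),
    ]
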
@@ -437,6 +396,18 @@ class FaultyParser(DocumentParser):
raise ParseError("Does not compute.")


def fake_magic_from_file(file, mime=False):

if mime:
if os.path.splitext(file)[1] == ".pdf":
return "application/pdf"
else:
return "unknown"
else:
return "A verbose string that describes the contents of the file"
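
fake_magic_from_file above stubs out python-magic so the consumer tests never touch libmagic: the file extension alone decides the reported mime type. A quick illustration of what the stub returns:

    assert fake_magic_from_file("doc.pdf", mime=True) == "application/pdf"
    assert fake_magic_from_file("doc.png", mime=True) == "unknown"
    assert fake_magic_from_file("doc.pdf").startswith("A verbose string")
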
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||
class TestConsumer(TestCase):
|
||||
|
||||
def make_dummy_parser(self, path, logging_group):
|
||||
@@ -462,7 +433,7 @@ class TestConsumer(TestCase):
|
||||
m = patcher.start()
|
||||
m.return_value = [(None, {
|
||||
"parser": self.make_dummy_parser,
|
||||
"test": lambda _: True,
|
||||
"mime_types": ["application/pdf"],
|
||||
"weight": 0
|
||||
})]
|
||||
|
||||
@@ -592,7 +563,7 @@ class TestConsumer(TestCase):
|
||||
def testFaultyParser(self, m):
|
||||
m.return_value = [(None, {
|
||||
"parser": self.make_faulty_parser,
|
||||
"test": lambda _: True,
|
||||
"mime_types": ["application/pdf"],
|
||||
"weight": 0
|
||||
})]
|
||||
|
||||
|
@@ -13,9 +13,12 @@ class TestDocument(TestCase):
title="Title",
content="content",
checksum="checksum",
mime_type="application/pdf"
)

file_path = document.source_path
thumb_path = document.thumbnail_path

with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete()
mock_unlink.assert_any_call(file_path)
@@ -31,7 +31,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -44,7 +44,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -81,7 +81,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_missing_permissions(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -111,10 +111,10 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_file_renaming_database_error(self):

document1 = Document.objects.create(file_type="pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")
document1 = Document.objects.create(mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_UNENCRYPTED, checksum="AAAAA")

document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.checksum = "BBBBB"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
@@ -149,7 +149,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -170,7 +170,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_document_delete_nofile(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -179,7 +179,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
def test_directory_not_empty(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -206,7 +206,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_underscore(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -222,7 +222,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_with_dash(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -238,7 +238,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
def test_tags_malformed(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -254,7 +254,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[0]}")
def test_tags_all(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -269,7 +269,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[1]}")
def test_tags_out_of_bounds(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -284,7 +284,7 @@ class TestDate(TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}")
def test_nested_directory_cleanup(self):
document = Document()
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()

@@ -309,7 +309,7 @@ class TestDate(TestCase):
def test_format_none(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -335,7 +335,7 @@ class TestDate(TestCase):
def test_invalid_format(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

self.assertEqual(generate_filename(document), "0000001.pdf")
@@ -344,7 +344,7 @@ class TestDate(TestCase):
def test_invalid_format_key(self):
document = Document()
document.pk = 1
document.file_type = "pdf"
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

self.assertEqual(generate_filename(document), "0000001.pdf")
src/documents/tests/test_index.py (new file, 14 lines)
@@ -0,0 +1,14 @@
from django.test import TestCase

from documents.index import JsonFormatter


class JsonFormatterTest(TestCase):

def setUp(self) -> None:
self.formatter = JsonFormatter()

def test_empty_fragments(self):
self.assertListEqual(self.formatter.format([]), [])
@@ -213,7 +213,7 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
TestCase.setUp(self)
User.objects.create_user(username='test_consumer', password='12345')
self.doc_contains = Document.objects.create(
content="I contain the keyword.", file_type="pdf")
content="I contain the keyword.", mime_type="application/pdf")

def test_tag_applied_any(self):
t1 = Tag.objects.create(
@@ -1,3 +1,4 @@
import os
from tempfile import TemporaryDirectory
from unittest import mock

@@ -6,6 +7,18 @@ from django.test import TestCase
from documents.parsers import get_parser_class


def fake_magic_from_file(file, mime=False):

if mime:
if os.path.splitext(file)[1] == ".pdf":
return "application/pdf"
else:
return "unknown"
else:
return "A verbose string that describes the contents of the file"


@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase):

@mock.patch("documents.parsers.document_consumer_declaration.send")
@@ -14,7 +27,7 @@ class TestParserDiscovery(TestCase):
pass

m.return_value = (
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
(None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
)

self.assertEqual(
@@ -32,8 +45,8 @@ class TestParserDiscovery(TestCase):
pass

m.return_value = (
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
(None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
(None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
)

self.assertEqual(
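
The patched return values above mimic Django's Signal.send(), which yields (receiver, response) pairs; the change is that a parser now declares the mime types it supports instead of supplying a test callable. A hedged sketch of what a declaration handler might look like (handler name and signature assumed):

    def dummy_parser_declaration(sender, **kwargs):
        # Shape inferred from the tuples above; weight breaks ties when
        # several parsers claim the same mime type.
        return {
            "parser": DummyParser,
            "weight": 0,
            "mime_types": ["application/pdf"],
        }
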
@@ -47,18 +47,30 @@ class IndexView(TemplateView):

class CorrespondentViewSet(ModelViewSet):
model = Correspondent
queryset = Correspondent.objects.annotate(document_count=Count('documents'), last_correspondence=Max('documents__created')).order_by('name')

queryset = Correspondent.objects.annotate(
document_count=Count('documents'),
last_correspondence=Max('documents__created')).order_by('name')

serializer_class = CorrespondentSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filterset_class = CorrespondentFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count", "last_correspondence")
ordering_fields = (
"name",
"matching_algorithm",
"match",
"document_count",
"last_correspondence")
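
Beyond the reflow, the annotated queryset is what makes document_count and last_correspondence legal ordering_fields: every Correspondent row carries both aggregates. Illustrative use of the same queryset (Count and Max are the django.db.models aggregates this module already imports):

    first = Correspondent.objects.annotate(
        document_count=Count('documents'),
        last_correspondence=Max('documents__created'),
    ).order_by('name').first()
    if first is not None:
        print(first.document_count, first.last_correspondence)
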
class TagViewSet(ModelViewSet):
model = Tag
queryset = Tag.objects.annotate(document_count=Count('documents')).order_by('name')

queryset = Tag.objects.annotate(
document_count=Count('documents')).order_by('name')

serializer_class = TagSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
@@ -69,7 +81,10 @@ class TagViewSet(ModelViewSet):

class DocumentTypeViewSet(ModelViewSet):
model = DocumentType
queryset = DocumentType.objects.annotate(document_count=Count('documents')).order_by('name')

queryset = DocumentType.objects.annotate(
document_count=Count('documents')).order_by('name')

serializer_class = DocumentTypeSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
@@ -92,10 +107,18 @@ class DocumentViewSet(RetrieveModelMixin,
filterset_class = DocumentFilterSet
search_fields = ("title", "correspondent__name", "content")
ordering_fields = (
"id", "title", "correspondent__name", "document_type__name", "created", "modified", "added", "archive_serial_number")
"id",
"title",
"correspondent__name",
"document_type__name",
"created",
"modified",
"added",
"archive_serial_number")

def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(request, *args, **kwargs)
response = super(DocumentViewSet, self).update(
request, *args, **kwargs)
index.add_or_update_document(self.get_object())
return response

@@ -104,18 +127,6 @@ class DocumentViewSet(RetrieveModelMixin,
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

def file_response(self, pk, disposition):
# TODO: this should not be necessary here.
content_types = {
Document.TYPE_PDF: "application/pdf",
Document.TYPE_PNG: "image/png",
Document.TYPE_JPG: "image/jpeg",
Document.TYPE_GIF: "image/gif",
Document.TYPE_TIF: "image/tiff",
Document.TYPE_CSV: "text/csv",
Document.TYPE_MD: "text/markdown",
Document.TYPE_TXT: "text/plain"
}

doc = Document.objects.get(id=pk)

if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
@@ -123,7 +134,7 @@ class DocumentViewSet(RetrieveModelMixin,
else:
file_handle = GnuPG.decrypted(doc.source_file)

response = HttpResponse(file_handle, content_type=content_types[doc.file_type])
response = HttpResponse(file_handle, content_type=doc.mime_type)
response["Content-Disposition"] = '{}; filename="{}"'.format(
disposition, doc.file_name)
return response
@@ -150,7 +161,8 @@ class DocumentViewSet(RetrieveModelMixin,
@cache_control(public=False, max_age=315360000)
def thumb(self, request, pk=None):
try:
return HttpResponse(Document.objects.get(id=pk).thumbnail_file, content_type='image/png')
return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
content_type='image/png')
except FileNotFoundError:
raise Http404("Document thumbnail does not exist")

@@ -242,5 +254,6 @@ class StatisticsView(APIView):
def get(self, request, format=None):
return Response({
'documents_total': Document.objects.all().count(),
'documents_inbox': Document.objects.filter(tags__is_inbox_tag=True).distinct().count()
'documents_inbox': Document.objects.filter(
tags__is_inbox_tag=True).distinct().count()
})