Merge branch 'dev' into feature-websockets-status
@@ -1 +1,2 @@
from .checks import changed_password_check
# this is here so that django finds the checks.
from .checks import *

@@ -4,12 +4,13 @@ import os
import pickle
import re

from django.conf import settings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.utils.multiclass import type_of_target

from documents.models import Document, MatchingModel
from paperless import settings


class IncompatibleClassifierVersionError(Exception):
@@ -27,7 +28,7 @@ def preprocess_content(content):

class DocumentClassifier(object):

    FORMAT_VERSION = 5
    FORMAT_VERSION = 6

    def __init__(self):
        # mtime of the model file on disk. used to prevent reloading when
@@ -54,6 +55,8 @@ class DocumentClassifier(object):
                    "Cannot load classifier, incompatible versions.")
            else:
                if self.classifier_version > 0:
                    # Don't be confused by this check. It's simply here
                    # so that we won't log anything on initial reload.
                    logger.info("Classifier updated on disk, "
                                "reloading classifier models")
                self.data_hash = pickle.load(f)
@@ -122,9 +125,14 @@ class DocumentClassifier(object):
        labels_tags_unique = set([tag for tags in labels_tags for tag in tags])

        num_tags = len(labels_tags_unique)

        # subtract 1 since -1 (null) is also part of the classes.
        num_correspondents = len(set(labels_correspondent)) - 1
        num_document_types = len(set(labels_document_type)) - 1

        # union with {-1} accounts for cases where all documents have
        # correspondents and types assigned, so -1 isn't part of labels_x,
        # which it usually is.
        num_correspondents = len(set(labels_correspondent) | {-1}) - 1
        num_document_types = len(set(labels_document_type) | {-1}) - 1

        logging.getLogger(__name__).debug(
            "{} documents, {} tag(s), {} correspondent(s), "
@@ -145,12 +153,23 @@ class DocumentClassifier(object):
        )
        data_vectorized = self.data_vectorizer.fit_transform(data)

        self.tags_binarizer = MultiLabelBinarizer()
        labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)

        # Step 3: train the classifiers
        if num_tags > 0:
            logging.getLogger(__name__).debug("Training tags classifier...")

            if num_tags == 1:
                # Special case where only one tag has auto:
                # Fallback to binary classification.
                labels_tags = [label[0] if len(label) == 1 else -1
                               for label in labels_tags]
                self.tags_binarizer = LabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(
                    labels_tags).ravel()
            else:
                self.tags_binarizer = MultiLabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(
                    labels_tags)

            self.tags_classifier = MLPClassifier(tol=0.01)
            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
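A minimal sketch of the two label shapes this branch produces, assuming scikit-learn as imported above; the sample tag ids are made up:

from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

MultiLabelBinarizer().fit_transform([[1, 2], [2], []])
# -> 3x2 indicator matrix, one column per tag id
LabelBinarizer().fit_transform([1, -1, 1]).ravel()
# -> flat array [1, 0, 1]; -1 serves as the "no tag" class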
@@ -222,6 +241,16 @@ class DocumentClassifier(object):
            X = self.data_vectorizer.transform([preprocess_content(content)])
            y = self.tags_classifier.predict(X)
            tags_ids = self.tags_binarizer.inverse_transform(y)[0]
            return tags_ids
            if type_of_target(y).startswith('multilabel'):
                # the usual case when there are multiple tags.
                return list(tags_ids)
            elif type_of_target(y) == 'binary' and tags_ids != -1:
                # This is for when we have binary classification with only one
                # tag and the result is to assign this tag.
                return [tags_ids]
            else:
                # Usually binary as well with -1 as the result, but we're
                # going to catch everything else here as well.
                return []
        else:
            return []

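How the prediction branch tells the two cases apart, assuming scikit-learn; the arrays stand in for classifier output:

import numpy as np
from sklearn.utils.multiclass import type_of_target

type_of_target(np.array([[0, 1], [1, 1]]))  # 'multilabel-indicator'
type_of_target(np.array([1, -1, 1]))        # 'binary'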
@@ -8,14 +8,15 @@ from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
from django.db.models import Q
from django.utils import timezone

from paperless.db import GnuPG
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .file_handling import create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type
from .parsers import ParseError, get_parser_class_for_mime_type, \
    get_supported_file_extensions, parse_date
from .signals import (
    document_consumption_finished,
    document_consumption_started
@@ -58,21 +59,10 @@ class Consumer(LoggingMixin):
            raise ConsumerError("Cannot consume {}: It is not a file".format(
                self.path))

    def pre_check_consumption_dir(self):
        if not settings.CONSUMPTION_DIR:
            raise ConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set.")

        if not os.path.isdir(settings.CONSUMPTION_DIR):
            raise ConsumerError(
                "Consumption directory {} does not exist".format(
                    settings.CONSUMPTION_DIR))

    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        if Document.objects.filter(checksum=checksum).exists():
        if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501
            if settings.CONSUMER_DELETE_DUPLICATES:
                os.unlink(self.path)
            raise ConsumerError(
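Why both checksum fields fit in 32 characters: the MD5 hexdigest used above is always 32 hex characters, regardless of input size.

import hashlib

print(hashlib.md5(b"hello").hexdigest())  # 5d41402abc4b2a76b9719d911017c592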
@@ -83,6 +73,7 @@ class Consumer(LoggingMixin):
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

    def try_consume_file(self,
                         path,
@@ -110,7 +101,6 @@ class Consumer(LoggingMixin):
        # Make sure that preconditions for consuming the file are met.

        self.pre_check_file_exists()
        self.pre_check_consumption_dir()
        self.pre_check_directories()
        self.pre_check_duplicate()

@@ -145,7 +135,7 @@ class Consumer(LoggingMixin):

        # This doesn't parse the document yet, but gives us a parser.

        document_parser = parser_class(self.path, self.logging_group, progress_callback)
        document_parser = parser_class(self.logging_group, progress_callback)

        # However, this already created working directories which we have to
        # clean up.
@@ -153,19 +143,30 @@ class Consumer(LoggingMixin):
        # Parse the document. This may take some time.

        try:
            self.log("debug", f"Generating thumbnail for {self.filename}...")
            self._send_progress(self.filename, 10, 100, 'WORKING',
                                'Generating thumbnail...')
            thumbnail = document_parser.get_optimised_thumbnail()
            self.log("debug", "Parsing {}...".format(self.filename))
            self._send_progress(self.filename, 20, 100, 'WORKING',
                                'Getting text from document...')
                                'Parsing document...')
            self.log("debug", "Parsing {}...".format(self.filename))
            document_parser.parse(self.path, mime_type)

            self.log("debug", f"Generating thumbnail for {self.filename}...")
            self._send_progress(self.filename, 70, 100, 'WORKING',
                                'Generating thumbnail...')
            thumbnail = document_parser.get_optimised_thumbnail(
                self.path, mime_type)

            text = document_parser.get_text()
            self._send_progress(self.filename, 80, 100, 'WORKING',
                                'Getting date from document...')
            date = document_parser.get_date()
            if not date:
                self._send_progress(self.filename, 90, 100, 'WORKING',
                                    'Getting date from document...')
                date = parse_date(self.filename, text)
            archive_path = document_parser.get_archive_path()

        except ParseError as e:
            document_parser.cleanup()
            self.log(
                "error",
                f"Error while consuming document {self.filename}: {e}")
            self._send_progress(self.filename, 100, 100, 'FAILED',
                                "Failed: {}".format(e))
            raise ConsumerError(e)
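A hedged sketch of what _send_progress might do with the channels imports above; the group name and payload shape are assumptions, not taken from this diff:

from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer

def _send_progress(filename, current, maximum, status, message):
    payload = {
        "type": "status_update",  # assumed handler name on the websocket consumer
        "data": {"filename": filename, "current_progress": current,
                 "max_progress": maximum, "status": status,
                 "message": message},
    }
    async_to_sync(get_channel_layer().group_send)("status_updates", payload)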
@@ -183,7 +184,7 @@ class Consumer(LoggingMixin):
            logging.getLogger(__name__).warning(
                "Cannot classify documents: {}.".format(e))
            classifier = None
        self._send_progress(self.filename, 85, 100, 'WORKING',
        self._send_progress(self.filename, 95, 100, 'WORKING',
                            'Storing the document...')
        # now that everything is done, we can start to store the document
        # in the system. This will be a transaction and reasonably fast.
@@ -200,9 +201,6 @@ class Consumer(LoggingMixin):
                # If we get here, it was successful. Proceed with post-consume
                # hooks. If they fail, nothing will get changed.

                self._send_progress(self.filename, 90, 100, 'WORKING',
                                    'Performing post-consumption tasks...')

                document_consumption_finished.send(
                    sender=self.__class__,
                    document=document,
@@ -213,14 +211,41 @@ class Consumer(LoggingMixin):
                # After everything is in the database, copy the files into
                # place. If this fails, we'll also rollback the transaction.

                # TODO: not required, since this is done by the file handling
                # logic
                create_source_path_directory(document.source_path)
                self._write(document, self.path, document.source_path)
                self._write(document, thumbnail, document.thumbnail_path)

                self._write(document.storage_type,
                            self.path, document.source_path)

                self._write(document.storage_type,
                            thumbnail, document.thumbnail_path)

                if archive_path and os.path.isfile(archive_path):
                    self._write(document.storage_type,
                                archive_path, document.archive_path)

                    with open(archive_path, 'rb') as f:
                        document.archive_checksum = hashlib.md5(
                            f.read()).hexdigest()
                        document.save()

                # After performing all database operations and moving files
                # into place, tell paperless where the file is.
                document.filename = os.path.basename(document.source_path)
                # Saving the document now will trigger the filename handling
                # logic.
                document.save()

                # Delete the file only if it was successfully consumed
                self.log("debug", "Deleting file {}".format(self.path))
                os.unlink(self.path)
        except Exception as e:
            self.log(
                "error",
                f"The following error occurred while consuming "
                f"{self.filename}: {e}"
            )
            self._send_progress(self.filename, 100, 100, 'FAILED',
                                "Failed: {}".format(e))
            raise ConsumerError(e)
@@ -250,10 +275,7 @@ class Consumer(LoggingMixin):
        created = file_info.created or date or timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))

        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG
        else:
            storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        with open(self.path, "rb") as f:
            document = Document.objects.create(
@@ -275,12 +297,6 @@ class Consumer(LoggingMixin):

        self.apply_overrides(document)

        document.filename = generate_filename(document)

        # We need to save the document twice, since we need the PK of the
        # document in order to create its filename above.
        document.save()

        return document

    def apply_overrides(self, document):
@@ -299,11 +315,7 @@ class Consumer(LoggingMixin):
        for tag_id in self.override_tag_ids:
            document.tags.add(Tag.objects.get(pk=tag_id))

    def _write(self, document, source, target):
    def _write(self, storage_type, source, target):
        with open(source, "rb") as read_file:
            with open(target, "wb") as write_file:
                if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
                    write_file.write(read_file.read())
                    return
                self.log("debug", "Encrypting")
                write_file.write(GnuPG.encrypted(read_file))
                write_file.write(read_file.read())

@@ -1,7 +1,9 @@
import datetime
import logging
import os
from collections import defaultdict

import pathvalidate
from django.conf import settings
from django.template.defaultfilters import slugify

@@ -10,10 +12,13 @@ def create_source_path_directory(source_path):
    os.makedirs(os.path.dirname(source_path), exist_ok=True)


def delete_empty_directories(directory):
def delete_empty_directories(directory, root):
    if not os.path.isdir(directory):
        return

    # Go up in the directory hierarchy and try to delete all directories
    directory = os.path.normpath(directory)
    root = os.path.normpath(settings.ORIGINALS_DIR)
    root = os.path.normpath(root)

    if not directory.startswith(root + os.path.sep):
        # don't do anything outside our originals folder.
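Effect of the normpath-based containment check, with made-up paths:

import os

root = os.path.normpath("/data/originals")
os.path.normpath("/data/originals/2020/invoices").startswith(root + os.path.sep)  # True: safe to prune
os.path.normpath("/data/originals/../tmp").startswith(root + os.path.sep)         # False: ".." resolved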
@@ -72,14 +77,31 @@ def generate_filename(doc):
    if settings.PAPERLESS_FILENAME_FORMAT is not None:
        tags = defaultdict(lambda: slugify(None),
                           many_to_dictionary(doc.tags))

        if doc.correspondent:
            correspondent = pathvalidate.sanitize_filename(
                doc.correspondent.name, replacement_text="-"
            )
        else:
            correspondent = "none"

        if doc.document_type:
            document_type = pathvalidate.sanitize_filename(
                doc.document_type.name, replacement_text="-"
            )
        else:
            document_type = "none"

        path = settings.PAPERLESS_FILENAME_FORMAT.format(
            correspondent=slugify(doc.correspondent),
            title=slugify(doc.title),
            created=slugify(doc.created),
            title=pathvalidate.sanitize_filename(
                doc.title, replacement_text="-"),
            correspondent=correspondent,
            document_type=document_type,
            created=datetime.date.isoformat(doc.created),
            created_year=doc.created.year if doc.created else "none",
            created_month=doc.created.month if doc.created else "none",
            created_day=doc.created.day if doc.created else "none",
            added=slugify(doc.added),
            added=datetime.date.isoformat(doc.added),
            added_year=doc.added.year if doc.added else "none",
            added_month=doc.added.month if doc.added else "none",
            added_day=doc.added.day if doc.added else "none",
@@ -101,3 +123,8 @@ def generate_filename(doc):
        filename += ".gpg"

    return filename


def archive_name_from_filename(filename):

    return os.path.splitext(filename)[0] + ".pdf"

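Usage of the new helper, with made-up filenames; the archived twin of a document always carries a .pdf extension:

archive_name_from_filename("0000042.tiff")      # '0000042.pdf'
archive_name_from_filename("invoice-acme.pdf")  # 'invoice-acme.pdf'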
@@ -1,59 +0,0 @@
import os
import tempfile
from datetime import datetime
from time import mktime

import magic
from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError

from documents.parsers import is_mime_type_supported


class UploadForm(forms.Form):

    document = forms.FileField()

    def clean_document(self):
        document_name = self.cleaned_data.get("document").name

        try:
            validate_filename(document_name)
        except ValidationError:
            raise forms.ValidationError("That filename is suspicious.")

        document_data = self.cleaned_data.get("document").read()

        mime_type = magic.from_buffer(document_data, mime=True)

        if not is_mime_type_supported(mime_type):
            raise forms.ValidationError("This mime type is not supported.")

        return document_name, document_data

    def save(self):
        """
        Since the consumer already does a lot of work, it's easier just to save
        to-be-consumed files to the consumption directory rather than have the
        form do that as well. Think of it as a poor-man's queue server.
        """

        original_filename, data = self.cleaned_data.get("document")

        t = int(mktime(datetime.now().timetuple()))

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                         dir=settings.SCRATCH_DIR,
                                         delete=False) as f:

            f.write(data)
            os.utime(f.name, times=(t, t))

            async_task("documents.tasks.consume_file",
                       f.name,
                       override_filename=original_filename,
                       task_name=os.path.basename(original_filename)[:100])
@@ -4,10 +4,11 @@ from contextlib import contextmanager

from django.conf import settings
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.writing import AsyncWriter

@@ -59,33 +60,49 @@ def get_schema():
        id=NUMERIC(stored=True, unique=True, numtype=int),
        title=TEXT(stored=True),
        content=TEXT(),
        correspondent=TEXT(stored=True)
        correspondent=TEXT(stored=True),
        tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
        type=TEXT(stored=True),
        created=DATETIME(stored=True, sortable=True),
        modified=DATETIME(stored=True, sortable=True),
        added=DATETIME(stored=True, sortable=True),
    )


def open_index(recreate=False):
    if exists_in(settings.INDEX_DIR) and not recreate:
        return open_dir(settings.INDEX_DIR)
    else:
        # TODO: this is not thread safe. If 2 instances try to create the index
        # at the same time, this fails. This currently prevents parallel
        # tests.
        if not os.path.isdir(settings.INDEX_DIR):
            os.makedirs(settings.INDEX_DIR, exist_ok=True)
        return create_in(settings.INDEX_DIR, get_schema())
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception as e:
        logger.error(f"Error while opening the index: {e}, recreating.")

    if not os.path.isdir(settings.INDEX_DIR):
        os.makedirs(settings.INDEX_DIR, exist_ok=True)
    return create_in(settings.INDEX_DIR, get_schema())


def update_document(writer, doc):
    # TODO: this line caused many issues all around, since:
    # We need to make sure that this method does not get called with
    # deserialized documents (i.e., document objects that don't come from
    # Django's ORM interfaces directly).
    logger.debug("Indexing {}...".format(doc))
    tags = ",".join([t.name for t in doc.tags.all()])
    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=doc.correspondent.name if doc.correspondent else None
        correspondent=doc.correspondent.name if doc.correspondent else None,
        tag=tags if tags else None,
        type=doc.document_type.name if doc.document_type else None,
        created=doc.created,
        added=doc.added,
        modified=doc.modified,
    )


def remove_document(writer, doc):
    # TODO: see above.
    logger.debug("Removing {} from index...".format(doc))
    writer.delete_by_term('id', doc.pk)

@@ -103,16 +120,27 @@ def remove_document_from_index(document):

@contextmanager
def query_page(ix, query, page):
def query_page(ix, querystring, page):
    searcher = ix.searcher()
    try:
        query_parser = MultifieldParser(["content", "title", "correspondent"],
                                        ix.schema).parse(query)
        result_page = searcher.search_page(query_parser, page)
        qp = MultifieldParser(
            ["content", "title", "correspondent", "tag", "type"],
            ix.schema)
        qp.add_plugin(DateParserPlugin())

        q = qp.parse(querystring)
        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()
        yield result_page

        corrected = searcher.correct_query(q, querystring)
        if corrected.query != q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
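With DateParserPlugin registered, date expressions work directly in query strings; a sketch assuming an index ix built with the schema above, query text made up:

with query_page(ix, "tag:invoice created:[2020 to today]", page=1) \
        as (result_page, corrected_query):
    for hit in result_page:
        print(hit["title"])
    if corrected_query:
        print("Did you mean:", corrected_query)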
@@ -1,9 +1,14 @@
import logging
import uuid

from django.conf import settings


class PaperlessHandler(logging.Handler):
    def emit(self, record):
        if settings.DISABLE_DBHANDLER:
            return

        # We have to do the import here or Django will barf when it tries to
        # load this because the apps aren't loaded at that point
        from .models import Log
@@ -23,10 +28,10 @@ class LoggingMixin:
    def renew_logging_group(self):
        self.logging_group = uuid.uuid4()

    def log(self, level, message):
    def log(self, level, message, **kwargs):
        target = ".".join([self.__class__.__module__, self.__class__.__name__])
        logger = logging.getLogger(target)

        getattr(logger, level)(message, extra={
            "group": self.logging_group
        })
        }, **kwargs)

@@ -17,16 +17,6 @@ class Command(BaseCommand):

    def add_arguments(self, parser):

        parser.add_argument(
            "from",
            choices=("gpg", "unencrypted"),
            help="The state you want to change your documents from"
        )
        parser.add_argument(
            "to",
            choices=("gpg", "unencrypted"),
            help="The state you want to change your documents to"
        )
        parser.add_argument(
            "--passphrase",
            help="If PAPERLESS_PASSPHRASE isn't set already, you need to "
@@ -50,11 +40,6 @@ class Command(BaseCommand):
        except KeyboardInterrupt:
            return

        if options["from"] == options["to"]:
            raise CommandError(
                'The "from" and "to" values can\'t be the same.'
            )

        passphrase = options["passphrase"] or settings.PASSPHRASE
        if not passphrase:
            raise CommandError(
@@ -62,10 +47,7 @@ class Command(BaseCommand):
                "by declaring it in your environment or your config."
            )

        if options["from"] == "gpg" and options["to"] == "unencrypted":
            self.__gpg_to_unencrypted(passphrase)
        elif options["from"] == "unencrypted" and options["to"] == "gpg":
            self.__unencrypted_to_gpg(passphrase)
        self.__gpg_to_unencrypted(passphrase)

    @staticmethod
    def __gpg_to_unencrypted(passphrase):
@@ -79,42 +61,28 @@ class Command(BaseCommand):
                document).encode('utf-8'), "green"))

            old_paths = [document.source_path, document.thumbnail_path]

            raw_document = GnuPG.decrypted(document.source_file, passphrase)
            raw_thumb = GnuPG.decrypted(document.thumbnail_file, passphrase)

            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

            ext = os.path.splitext(document.filename)[1]

            if not ext == '.gpg':
                raise CommandError(
                    f"Abort: encrypted file {document.source_path} does not "
                    f"end with .gpg")

            document.filename = os.path.splitext(document.filename)[0]

            with open(document.source_path, "wb") as f:
                f.write(raw_document)

            with open(document.thumbnail_path, "wb") as f:
                f.write(raw_thumb)

            document.save(update_fields=("storage_type",))

            for path in old_paths:
                os.unlink(path)

    @staticmethod
    def __unencrypted_to_gpg(passphrase):

        unencrypted_files = Document.objects.filter(
            storage_type=Document.STORAGE_TYPE_UNENCRYPTED)

        for document in unencrypted_files:

            print(coloured("Encrypting {}".format(document), "green"))

            old_paths = [document.source_path, document.thumbnail_path]
            with open(document.source_path, "rb") as raw_document:
                with open(document.thumbnail_path, "rb") as raw_thumb:
                    document.storage_type = Document.STORAGE_TYPE_GPG
                    with open(document.source_path, "wb") as f:
                        f.write(GnuPG.encrypted(raw_document, passphrase))
                    with open(document.thumbnail_path, "wb") as f:
                        f.write(GnuPG.encrypted(raw_thumb, passphrase))

            document.save(update_fields=("storage_type",))
            document.save(update_fields=("storage_type", "filename"))

            for path in old_paths:
                os.unlink(path)
src/documents/management/commands/document_archiver.py (new file)
@@ -0,0 +1,128 @@
import hashlib
import multiprocessing

import logging
import os
import shutil
import uuid

import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from whoosh.writing import AsyncWriter

from documents.models import Document
from ... import index
from ...file_handling import create_source_path_directory
from ...mixins import Renderable
from ...parsers import get_parser_class_for_mime_type


logger = logging.getLogger(__name__)


def handle_document(document_id):
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type)

        if parser.get_archive_path():
            with transaction.atomic():
                with open(parser.get_archive_path(), 'rb') as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # I'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # We also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text()
                )
                create_source_path_directory(document.archive_path)
                shutil.move(parser.get_archive_path(), document.archive_path)

        with AsyncWriter(index.open_index()) as writer:
            index.update_document(writer, document)

    except Exception as e:
        logger.error(f"Error while parsing document {document}: {str(e)}")
    finally:
        parser.cleanup()


class Command(Renderable, BaseCommand):

    help = """
        Using the current classification model, assigns correspondents, tags
        and document types to all documents, effectively allowing you to
        back-tag all previously indexed documents with metadata created (or
        modified) after their initial import.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "-f", "--overwrite",
            default=False,
            action="store_true",
            help="Recreates the archived document for documents that already "
                 "have an archived version."
        )
        parser.add_argument(
            "-d", "--document",
            default=None,
            type=int,
            required=False,
            help="Specify the ID of a document, and this command will only "
                 "run on this specific document."
        )

    def handle(self, *args, **options):

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        overwrite = options["overwrite"]

        if options['document']:
            documents = Document.objects.filter(pk=options['document'])
        else:
            documents = Document.objects.all()

        document_ids = list(map(
            lambda doc: doc.id,
            filter(
                lambda d: overwrite or not d.archive_checksum,
                documents
            )
        ))

        # Note to future self: this prevents django from reusing database
        # connections between processes, which is bad and does not work
        # with postgres.
        db.connections.close_all()

        try:

            logging.getLogger().handlers[0].level = logging.ERROR
            with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
                list(tqdm.tqdm(
                    pool.imap_unordered(
                        handle_document,
                        document_ids
                    ),
                    total=len(document_ids)
                ))
        except KeyboardInterrupt:
            print("Aborting...")
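Typical invocations of the new command, given the arguments defined above (manage.py assumed, as for any Django management command):

python3 manage.py document_archiver          # archive documents without an archived version
python3 manage.py document_archiver -f       # recreate existing archived versions as well
python3 manage.py document_archiver -d 42    # only process the document with ID 42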
@@ -1,37 +1,104 @@
import logging
import os
from pathlib import Path
from time import sleep

from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from django.utils.text import slugify
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver

from documents.models import Tag
from documents.parsers import is_file_ext_supported

try:
    from inotify_simple import INotify, flags
    from inotifyrecursive import INotify, flags
except ImportError:
    INotify = flags = None

logger = logging.getLogger(__name__)


def _tags_from_path(filepath):
    """Walk up the directory tree from filepath to CONSUMPTION_DIR
    and get or create Tag IDs for every directory.
    """
    tag_ids = set()
    path_parts = Path(filepath).relative_to(
        settings.CONSUMPTION_DIR).parent.parts
    for part in path_parts:
        tag_ids.add(Tag.objects.get_or_create(
            slug=slugify(part),
            defaults={"name": part},
        )[0].pk)

    return tag_ids


def _consume(filepath):
    if os.path.isdir(filepath):
        return

    if not os.path.isfile(filepath):
        logger.debug(
            f"Not consuming file {filepath}: File has moved.")
        return

    if not is_file_ext_supported(os.path.splitext(filepath)[1]):
        logger.debug(
            f"Not consuming file {filepath}: Unknown file extension.")
        return

    tag_ids = None
    try:
        if settings.CONSUMER_SUBDIRS_AS_TAGS:
            tag_ids = _tags_from_path(filepath)
    except Exception as e:
        logger.error(
            "Error creating tags from path: {}".format(e))

    try:
        async_task("documents.tasks.consume_file",
                   filepath,
                   override_tag_ids=tag_ids if tag_ids else None,
                   task_name=os.path.basename(filepath)[:100])
    except Exception as e:
        # Catch all so that the consumer won't crash.
        # This is also what the test case is listening for to check for
        # errors.
        logger.error(
            "Error while consuming document: {}".format(e))


def _consume_wait_unmodified(file, num_tries=20, wait_time=1):
    mtime = -1
    current_try = 0
    while current_try < num_tries:
        try:
            new_mtime = os.stat(file).st_mtime
        except FileNotFoundError:
            logger.debug(f"File {file} moved while waiting for it to remain "
                         f"unmodified.")
            return
        if new_mtime == mtime:
            _consume(file)
            return
        mtime = new_mtime
        sleep(wait_time)
        current_try += 1

    logger.error(f"Timeout while waiting on file {file} to remain unmodified.")


class Handler(FileSystemEventHandler):

    def _consume(self, file):
        if os.path.isfile(file):
            try:
                async_task("documents.tasks.consume_file",
                           file,
                           task_name=os.path.basename(file)[:100])
            except Exception as e:
                # Catch all so that the consumer won't crash.
                logging.getLogger(__name__).error(
                    "Error while consuming document: {}".format(e))

    def on_created(self, event):
        self._consume(event.src_path)
        _consume_wait_unmodified(event.src_path)

    def on_moved(self, event):
        self._consume(event.src_path)
        _consume_wait_unmodified(event.dest_path)


class Command(BaseCommand):
@@ -40,12 +107,15 @@ class Command(BaseCommand):
        consumption directory.
    """

    # This is here primarily for the tests and is irrelevant in production.
    stop_flag = False

    def __init__(self, *args, **kwargs):

        self.verbosity = 0
        self.logger = logging.getLogger(__name__)

        BaseCommand.__init__(self, *args, **kwargs)
        self.observer = None

    def add_arguments(self, parser):
        parser.add_argument(
@@ -54,38 +124,81 @@ class Command(BaseCommand):
            nargs="?",
            help="The consumption directory."
        )

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]
        directory = options["directory"]

        logging.getLogger(__name__).info(
            "Starting document consumer at {}".format(
                directory
            )
        parser.add_argument(
            "--oneshot",
            action="store_true",
            help="Run only once."
        )

        # Consume all files as this is not done initially by the watchdog
        for entry in os.scandir(directory):
            if entry.is_file():
                async_task("documents.tasks.consume_file",
                           entry.path,
                           task_name=os.path.basename(entry.path)[:100])
    def handle(self, *args, **options):
        directory = options["directory"]
        recursive = settings.CONSUMER_RECURSIVE

        # Start the watchdog. Woof!
        if settings.CONSUMER_POLLING > 0:
            logging.getLogger(__name__).info(
                "Using polling instead of file system notifications.")
            observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        if not directory:
            raise CommandError(
                "CONSUMPTION_DIR does not appear to be set."
            )

        if not os.path.isdir(directory):
            raise CommandError(
                f"Consumption directory {directory} does not exist")

        if recursive:
            for dirpath, _, filenames in os.walk(directory):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    _consume(filepath)
        else:
            observer = Observer()
        event_handler = Handler()
        observer.schedule(event_handler, directory, recursive=True)
        observer.start()
            for entry in os.scandir(directory):
                _consume(entry.path)

        if options["oneshot"]:
            return

        if settings.CONSUMER_POLLING == 0 and INotify:
            self.handle_inotify(directory, recursive)
        else:
            self.handle_polling(directory, recursive)

        logger.debug("Consumer exiting.")

    def handle_polling(self, directory, recursive):
        logging.getLogger(__name__).info(
            f"Polling directory for changes: {directory}")
        self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        self.observer.schedule(Handler(), directory, recursive=recursive)
        self.observer.start()
        try:
            while observer.is_alive():
                observer.join(1)
            while self.observer.is_alive():
                self.observer.join(1)
                if self.stop_flag:
                    self.observer.stop()
        except KeyboardInterrupt:
            observer.stop()
            observer.join()
            self.observer.stop()
            self.observer.join()

    def handle_inotify(self, directory, recursive):
        logging.getLogger(__name__).info(
            f"Using inotify to watch directory for changes: {directory}")

        inotify = INotify()
        inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO
        if recursive:
            descriptor = inotify.add_watch_recursive(directory, inotify_flags)
        else:
            descriptor = inotify.add_watch(directory, inotify_flags)

        try:
            while not self.stop_flag:
                for event in inotify.read(timeout=1000):
                    if recursive:
                        path = inotify.get_path(event.wd)
                    else:
                        path = directory
                    filepath = os.path.join(path, event.name)
                    _consume(filepath)
        except KeyboardInterrupt:
            pass

        inotify.rm_watch(descriptor)
        inotify.close()

@@ -7,7 +7,8 @@ from django.core import serializers
from django.core.management.base import BaseCommand, CommandError

from documents.models import Document, Correspondent, Tag, DocumentType
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
    EXPORTER_ARCHIVE_NAME
from paperless.db import GnuPG
from ...mixins import Renderable

@@ -22,13 +23,6 @@ class Command(Renderable, BaseCommand):

    def add_arguments(self, parser):
        parser.add_argument("target")
        parser.add_argument(
            "--legacy",
            action="store_true",
            help="Don't try to export all of the document data, just dump the "
                 "original document files out in a format that makes "
                 "re-consuming them easy."
        )

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)
@@ -44,10 +38,10 @@ class Command(Renderable, BaseCommand):
        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        if options["legacy"]:
            self.dump_legacy()
        else:
            self.dump()
        if os.listdir(self.target):
            raise CommandError("That directory is not empty.")

        self.dump()

    def dump(self):

@@ -63,34 +57,56 @@ class Command(Renderable, BaseCommand):

            document = document_map[document_dict["pk"]]

            unique_filename = f"{document.pk:07}_{document.file_name}"
            print(f"Exporting: {document}")

            file_target = os.path.join(self.target, unique_filename)
            filename_counter = 0
            while True:
                original_name = document.get_public_filename(
                    counter=filename_counter)
                original_target = os.path.join(self.target, original_name)

            thumbnail_name = unique_filename + "-thumbnail.png"
                if not os.path.exists(original_target):
                    break
                else:
                    filename_counter += 1

            thumbnail_name = original_name + "-thumbnail.png"
            thumbnail_target = os.path.join(self.target, thumbnail_name)

            document_dict[EXPORTER_FILE_NAME] = unique_filename
            document_dict[EXPORTER_FILE_NAME] = original_name
            document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

            print(f"Exporting: {file_target}")
            if os.path.exists(document.archive_path):
                archive_name = document.get_public_filename(
                    archive=True, counter=filename_counter, suffix="_archive")
                archive_target = os.path.join(self.target, archive_name)
                document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
            else:
                archive_target = None

            t = int(time.mktime(document.created.timetuple()))
            if document.storage_type == Document.STORAGE_TYPE_GPG:

                with open(file_target, "wb") as f:
                with open(original_target, "wb") as f:
                    f.write(GnuPG.decrypted(document.source_file))
                os.utime(file_target, times=(t, t))
                os.utime(original_target, times=(t, t))

                with open(thumbnail_target, "wb") as f:
                    f.write(GnuPG.decrypted(document.thumbnail_file))
                os.utime(thumbnail_target, times=(t, t))

                if archive_target:
                    with open(archive_target, "wb") as f:
                        f.write(GnuPG.decrypted(document.archive_path))
                    os.utime(archive_target, times=(t, t))
            else:

                shutil.copy(document.source_path, file_target)
                shutil.copy(document.source_path, original_target)
                shutil.copy(document.thumbnail_path, thumbnail_target)

                if archive_target:
                    shutil.copy(document.archive_path, archive_target)

        manifest += json.loads(
            serializers.serialize("json", Correspondent.objects.all()))

@@ -102,33 +118,3 @@ class Command(Renderable, BaseCommand):

        with open(os.path.join(self.target, "manifest.json"), "w") as f:
            json.dump(manifest, f, indent=2)

    def dump_legacy(self):

        for document in Document.objects.all():

            target = os.path.join(
                self.target, self._get_legacy_file_name(document))

            print("Exporting: {}".format(target))

            with open(target, "wb") as f:
                f.write(GnuPG.decrypted(document.source_file))
                t = int(time.mktime(document.created.timetuple()))
                os.utime(target, times=(t, t))

    @staticmethod
    def _get_legacy_file_name(doc):

        if not doc.correspondent and not doc.title:
            return os.path.basename(doc.source_path)

        created = doc.created.strftime("%Y%m%d%H%M%SZ")
        tags = ",".join([t.slug for t in doc.tags.all()])

        if tags:
            return "{} - {} - {} - {}{}".format(
                created, doc.correspondent, doc.title, tags, doc.file_type)

        return "{} - {} - {}{}".format(
            created, doc.correspondent, doc.title, doc.file_type)

@@ -7,8 +7,8 @@ from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError

from documents.models import Document
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
    EXPORTER_ARCHIVE_NAME
from ...file_handling import generate_filename, create_source_path_directory
from ...mixins import Renderable

@@ -79,25 +79,41 @@ class Command(Renderable, BaseCommand):
                'appear to be in the source directory.'.format(doc_file)
            )

            if EXPORTER_ARCHIVE_NAME in record:
                archive_file = record[EXPORTER_ARCHIVE_NAME]
                if not os.path.exists(os.path.join(self.source, archive_file)):
                    raise CommandError(
                        f"The manifest file refers to {archive_file} which "
                        f"does not appear to be in the source directory."
                    )

    def _import_files_from_manifest(self):

        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        if settings.PASSPHRASE:
            storage_type = Document.STORAGE_TYPE_GPG
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

        for record in self.manifest:

            if not record["model"] == "documents.document":
                continue

            doc_file = record[EXPORTER_FILE_NAME]
            thumb_file = record[EXPORTER_THUMBNAIL_NAME]
            document = Document.objects.get(pk=record["pk"])

            doc_file = record[EXPORTER_FILE_NAME]
            document_path = os.path.join(self.source, doc_file)

            thumb_file = record[EXPORTER_THUMBNAIL_NAME]
            thumbnail_path = os.path.join(self.source, thumb_file)

            document.storage_type = storage_type
            if EXPORTER_ARCHIVE_NAME in record:
                archive_file = record[EXPORTER_ARCHIVE_NAME]
                archive_path = os.path.join(self.source, archive_file)
            else:
                archive_path = None

            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

            document.filename = generate_filename(document)

            if os.path.isfile(document.source_path):
@@ -105,23 +121,10 @@ class Command(Renderable, BaseCommand):

            create_source_path_directory(document.source_path)

            if settings.PASSPHRASE:

                with open(document_path, "rb") as unencrypted:
                    with open(document.source_path, "wb") as encrypted:
                        print("Encrypting {} and saving it to {}".format(
                            doc_file, document.source_path))
                        encrypted.write(GnuPG.encrypted(unencrypted))

                with open(thumbnail_path, "rb") as unencrypted:
                    with open(document.thumbnail_path, "wb") as encrypted:
                        print("Encrypting {} and saving it to {}".format(
                            thumb_file, document.thumbnail_path))
                        encrypted.write(GnuPG.encrypted(unencrypted))

            else:
                print(f"Moving {document_path} to {document.source_path}")
                shutil.copy(document_path, document.source_path)
                shutil.copy(thumbnail_path, document.thumbnail_path)
            print(f"Moving {document_path} to {document.source_path}")
            shutil.copy(document_path, document.source_path)
            shutil.copy(thumbnail_path, document.thumbnail_path)
            if archive_path:
                shutil.copy(archive_path, document.archive_path)

            document.save()

@@ -5,23 +5,6 @@ from django.db import migrations, models
import django.db.models.deletion


def make_index(apps, schema_editor):
    Document = apps.get_model("documents", "Document")
    documents = Document.objects.all()
    print()
    try:
        print("  --> Creating document index...")
        from whoosh.writing import AsyncWriter
        from documents import index
        ix = index.open_index(recreate=True)
        with AsyncWriter(ix) as writer:
            for document in documents:
                index.update_document(writer, document)
    except ImportError:
        # index may not be relevant anymore
        print("  --> Cannot create document index.")


def logs_set_default_group(apps, schema_editor):
    Log = apps.get_model('documents', 'Log')
    for log in Log.objects.all():
@@ -99,8 +82,4 @@ class Migration(migrations.Migration):
            code=django.db.migrations.operations.special.RunPython.noop,
            reverse_code=logs_set_default_group
        ),
        migrations.RunPython(
            code=make_index,
            reverse_code=django.db.migrations.operations.special.RunPython.noop,
        ),
    ]

src/documents/migrations/1004_sanity_check_schedule.py (new file)
@@ -0,0 +1,26 @@
# Generated by Django 3.1.3 on 2020-11-25 14:53

from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule


def add_schedules(apps, schema_editor):
    schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY)


def remove_schedules(apps, schema_editor):
    Schedule.objects.filter(func='documents.tasks.sanity_check').delete()


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1003_mime_types'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(add_schedules, remove_schedules)
    ]

src/documents/migrations/1005_checksums.py (new file)
@@ -0,0 +1,23 @@
# Generated by Django 3.1.3 on 2020-11-29 00:48

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1004_sanity_check_schedule'),
    ]

    operations = [
        migrations.AddField(
            model_name='document',
            name='archive_checksum',
            field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
        ),
        migrations.AlterField(
            model_name='document',
            name='checksum',
            field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
        ),
    ]

@@ -1,17 +1,21 @@
# coding=utf-8

import datetime
import logging
import mimetypes
import os
import re
from collections import OrderedDict

import pathvalidate

import dateutil.parser
from django.conf import settings
from django.db import models
from django.utils import timezone
from django.utils.text import slugify

from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension


class MatchingModel(models.Model):

@@ -157,9 +161,15 @@ class Document(models.Model):
        max_length=32,
        editable=False,
        unique=True,
        help_text="The checksum of the original document (before it was "
                  "encrypted). We use this to prevent duplicate document "
                  "imports."
        help_text="The checksum of the original document."
    )

    archive_checksum = models.CharField(
        max_length=32,
        editable=False,
        blank=True,
        null=True,
        help_text="The checksum of the archived document."
    )

    created = models.DateTimeField(
@@ -198,13 +208,11 @@ class Document(models.Model):
        ordering = ("correspondent", "title")

    def __str__(self):
        created = self.created.strftime("%Y%m%d%H%M%S")
        created = datetime.date.isoformat(self.created)
        if self.correspondent and self.title:
            return "{}: {} - {}".format(
                created, self.correspondent, self.title)
        if self.correspondent or self.title:
            return "{}: {}".format(created, self.correspondent or self.title)
        return str(created)
            return f"{created} {self.correspondent} {self.title}"
        else:
            return f"{created} {self.title}"

    @property
    def source_path(self):
@@ -225,12 +233,40 @@ class Document(models.Model):
        return open(self.source_path, "rb")

    @property
    def file_name(self):
        return slugify(str(self)) + self.file_type
    def archive_path(self):
        if self.filename:
            fname = archive_name_from_filename(self.filename)
        else:
            fname = "{:07}.pdf".format(self.pk)

        return os.path.join(
            settings.ARCHIVE_DIR,
            fname
        )

    @property
    def archive_file(self):
        return open(self.archive_path, "rb")

    def get_public_filename(self, archive=False, counter=0, suffix=None):
        result = str(self)

        if counter:
            result += f"_{counter:02}"

        if suffix:
            result += suffix

        if archive:
            result += ".pdf"
        else:
            result += self.file_type

        return pathvalidate.sanitize_filename(result, replacement_text="-")

    @property
    def file_type(self):
        return mimetypes.guess_extension(str(self.mime_type))
        return get_default_file_extension(self.mime_type)

    @property
    def thumbnail_path(self):

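What get_public_filename produces for a document titled "invoice", created 2020-11-21, stored as PDF and without a correspondent (values made up):

doc.get_public_filename()                                 # '2020-11-21 invoice.pdf'
doc.get_public_filename(counter=2)                        # '2020-11-21 invoice_02.pdf'
doc.get_public_filename(archive=True, suffix="_archive")  # '2020-11-21 invoice_archive.pdf'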
@@ -1,4 +1,5 @@
import logging
import mimetypes
import os
import re
import shutil
@@ -42,6 +43,40 @@ def is_mime_type_supported(mime_type):
    return get_parser_class_for_mime_type(mime_type) is not None


def get_default_file_extension(mime_type):
    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
        supported_mime_types = parser_declaration["mime_types"]

        if mime_type in supported_mime_types:
            return supported_mime_types[mime_type]

    ext = mimetypes.guess_extension(mime_type)
    if ext:
        return ext
    else:
        return ""


def is_file_ext_supported(ext):
    if ext:
        return ext.lower() in get_supported_file_extensions()
    else:
        return False


def get_supported_file_extensions():
    extensions = set()
    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
        supported_mime_types = parser_declaration["mime_types"]

        for mime_type in supported_mime_types:
            extensions.update(mimetypes.guess_all_extensions(mime_type))

    return extensions


def get_parser_class_for_mime_type(mime_type):

    options = []
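How the helpers interact, assuming some parser declared {"application/pdf": ".pdf"} in its mime_types:

get_default_file_extension("application/pdf")  # '.pdf', straight from the declaration
get_default_file_extension("text/html")        # falls back to mimetypes, e.g. '.html'
is_file_ext_supported(".PDF")                  # True: extensions are compared lowercased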
@@ -107,21 +142,59 @@ def run_convert(input_file,
    raise ParseError("Convert failed at {}".format(args))


def run_unpaper(pnm, logging_group=None):
    pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
def parse_date(filename, text):
    """
    Returns the date of the document.
    """

    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                    pnm_out)
    def __parser(ds, date_order):
        """
        Call dateparser.parse with a particular date ordering
        """
        return dateparser.parse(
            ds,
            settings={
                "DATE_ORDER": date_order,
                "PREFER_DAY_OF_MONTH": "first",
                "RETURN_AS_TIMEZONE_AWARE":
                True
            }
        )

    logger.debug(f"Execute: {' '.join(command_args)}",
                 extra={'group': logging_group})
    date = None

    if not subprocess.Popen(command_args,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL).wait() == 0:
        raise ParseError(f"Unpaper failed at {command_args}")
    next_year = timezone.now().year + 5  # Arbitrary 5 year future limit

    return pnm_out
    # if filename date parsing is enabled, search there first:
    if settings.FILENAME_DATE_ORDER:
        for m in re.finditer(DATE_REGEX, filename):
            date_string = m.group(0)

            try:
                date = __parser(date_string, settings.FILENAME_DATE_ORDER)
            except (TypeError, ValueError):
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None and next_year > date.year > 1900:
                return date

    # Iterate through all regex matches in text and try to parse the date
    for m in re.finditer(DATE_REGEX, text):
        date_string = m.group(0)

        try:
            date = __parser(date_string, settings.DATE_ORDER)
        except (TypeError, ValueError):
            # Skip all matches that do not parse to a proper date
            continue

        if date is not None and next_year > date.year > 1900:
            break
    else:
        date = None

    return date


class ParseError(Exception):
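How the module-level parse_date behaves, assuming FILENAME_DATE_ORDER is set and DATE_REGEX matches the strings; inputs made up:

parse_date("scan_13.02.2018.pdf", "")         # from the filename: tz-aware datetime for 2018-02-13
parse_date("scan.pdf", "paid on 13.02.2018")  # falls back to scanning the text
parse_date("scan.pdf", "no date in here")     # None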
@@ -134,27 +207,36 @@ class DocumentParser(LoggingMixin):
    `paperless_tesseract.parsers` for inspiration.
    """

    def __init__(self, path, logging_group, progress_callback):
    def __init__(self, logging_group, progress_callback):
        super().__init__()
        self.logging_group = logging_group
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)

        self.archive_path = None
        self.text = None
        self.date = None
        self.progress_callback = progress_callback

    def get_thumbnail(self):
    def parse(self, document_path, mime_type):
        raise NotImplementedError()

    def get_archive_path(self):
        return self.archive_path

    def get_thumbnail(self, document_path, mime_type):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

    def optimise_thumbnail(self, in_path):
    def get_optimised_thumbnail(self, document_path, mime_type):
        thumbnail = self.get_thumbnail(document_path, mime_type)
        if settings.OPTIMIZE_THUMBNAILS:
            out_path = os.path.join(self.tempdir, "optipng.png")
            out_path = os.path.join(self.tempdir, "thumb_optipng.png")

            args = (settings.OPTIPNG_BINARY,
                    "-silent", "-o5", in_path, "-out", out_path)
                    "-silent", "-o5", thumbnail, "-out", out_path)

            self.log('debug', f"Execute: {' '.join(args)}")

@@ -163,97 +245,13 @@ class DocumentParser(LoggingMixin):

            return out_path
        else:
            return in_path

    def get_optimised_thumbnail(self):
        return self.optimise_thumbnail(self.get_thumbnail())
            return thumbnail
    def get_text(self):
        """
        Returns the text from the document and only the text.
        """
        raise NotImplementedError()
        return self.text

    def get_date(self):
        """
        Returns the date of the document.
        """

        def __parser(ds, date_order):
            """
            Call dateparser.parse with a particular date ordering
            """
            return dateparser.parse(
                ds,
                settings={
                    "DATE_ORDER": date_order,
                    "PREFER_DAY_OF_MONTH": "first",
                    "RETURN_AS_TIMEZONE_AWARE": True
                }
            )

        date = None
        date_string = None

        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
        title = os.path.basename(self.document_path)

        # if filename date parsing is enabled, search there first:
        if settings.FILENAME_DATE_ORDER:
            self.log("info", "Checking document title for date")
            for m in re.finditer(DATE_REGEX, title):
                date_string = m.group(0)

                try:
                    date = __parser(date_string, settings.FILENAME_DATE_ORDER)
                except (TypeError, ValueError):
                    # Skip all matches that do not parse to a proper date
                    continue

                if date is not None and next_year > date.year > 1900:
                    self.log(
                        "info",
                        "Detected document date {} based on string {} "
                        "from document title"
                        "".format(date.isoformat(), date_string)
                    )
                    return date

        try:
            # getting text after checking filename will save time if only
            # looking at the filename instead of the whole text
            text = self.get_text()
        except ParseError:
            return None

        # Iterate through all regex matches in text and try to parse the date
        for m in re.finditer(DATE_REGEX, text):
            date_string = m.group(0)

            try:
                date = __parser(date_string, settings.DATE_ORDER)
            except (TypeError, ValueError):
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None and next_year > date.year > 1900:
                break
        else:
            date = None

        if date is not None:
            self.log(
                "info",
                "Detected document date {} based on string {}".format(
                    date.isoformat(),
                    date_string
                )
            )
        else:
            self.log("info", "Unable to detect date for document")

        return date
        return self.date

    def cleanup(self):
        self.log("debug", "Deleting directory {}".format(self.tempdir))
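To illustrate the reworked parser API above, where paths travel through parse() and get_thumbnail() instead of the constructor, here is a minimal hypothetical subclass; the DummyParser in the consumer tests below follows the same pattern, and real implementations live in paperless_tesseract.parsers:

    class PlainTextParser(DocumentParser):
        # hypothetical example parser, a sketch only

        def parse(self, document_path, mime_type):
            # populate self.text (and optionally self.archive_path / self.date)
            with open(document_path, "r") as f:
                self.text = f.read()

        def get_thumbnail(self, document_path, mime_type):
            # real parsers return the path of a PNG rendered into self.tempdir
            raise NotImplementedError()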
src/documents/sanity_checker.py (new file, +117 lines)
@@ -0,0 +1,117 @@
import hashlib
import os

from django.conf import settings

from documents.models import Document


class SanityMessage:
    message = None


class SanityWarning(SanityMessage):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"Warning: {self.message}"


class SanityError(SanityMessage):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"ERROR: {self.message}"


class SanityFailedError(Exception):

    def __init__(self, messages):
        self.messages = messages

    def __str__(self):
        message_string = "\n".join([str(m) for m in self.messages])
        return (
            f"The following issues were found by the sanity checker:\n"
            f"{message_string}\n\n===============\n\n")


def check_sanity():
    messages = []

    present_files = []
    for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
        for f in files:
            present_files.append(os.path.normpath(os.path.join(root, f)))

    for doc in Document.objects.all():
        # Check sanity of the thumbnail
        if not os.path.isfile(doc.thumbnail_path):
            messages.append(SanityError(
                f"Thumbnail of document {doc.pk} does not exist."))
        else:
            present_files.remove(os.path.normpath(doc.thumbnail_path))
            try:
                with doc.thumbnail_file as f:
                    f.read()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
                ))

        # Check sanity of the original file
        # TODO: extract method
        if not os.path.isfile(doc.source_path):
            messages.append(SanityError(
                f"Original of document {doc.pk} does not exist."))
        else:
            present_files.remove(os.path.normpath(doc.source_path))
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))
            else:
                if not checksum == doc.checksum:
                    messages.append(SanityError(
                        f"Checksum mismatch of document {doc.pk}. "
                        f"Stored: {doc.checksum}, actual: {checksum}."
                    ))

        # Check sanity of the archive file.
        if doc.archive_checksum:
            if not os.path.isfile(doc.archive_path):
                messages.append(SanityError(
                    f"Archived version of document {doc.pk} does not exist."
                ))
            else:
                present_files.remove(os.path.normpath(doc.archive_path))
                try:
                    with doc.archive_file as f:
                        checksum = hashlib.md5(f.read()).hexdigest()
                except OSError as e:
                    messages.append(SanityError(
                        f"Cannot read archive file of document {doc.pk}: {e}"
                    ))
                else:
                    if not checksum == doc.archive_checksum:
                        messages.append(SanityError(
                            f"Checksum mismatch of archive {doc.pk}. "
                            f"Stored: {doc.archive_checksum}, "
                            f"actual: {checksum}."
                        ))

        # other document checks
        if not doc.content:
            messages.append(SanityWarning(
                f"Document {doc.pk} has no content."
            ))

    for extra_file in present_files:
        messages.append(SanityWarning(
            f"Orphaned file in media dir: {extra_file}"
        ))

    return messages
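A sketch of how the checker might be driven, for example from a Django shell; the sanity_check task added to documents/tasks.py below wraps it the same way:

    from documents.sanity_checker import check_sanity, SanityError

    messages = check_sanity()
    for message in messages:
        # SanityError/SanityWarning __str__ already add their prefix
        print(str(message))

    errors = [m for m in messages if isinstance(m, SanityError)]
    print(f"{len(messages)} issue(s), {len(errors)} of them errors")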
@@ -1,6 +1,9 @@
import magic
from pathvalidate import validate_filename, ValidationError
from rest_framework import serializers

from .models import Correspondent, Tag, Document, Log, DocumentType
from .parsers import is_mime_type_supported


class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
@@ -76,11 +79,9 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):

class DocumentSerializer(serializers.ModelSerializer):

    correspondent_id = CorrespondentField(
        allow_null=True, source='correspondent')
    tags_id = TagsField(many=True, source='tags')
    document_type_id = DocumentTypeField(
        allow_null=True, source='document_type')
    correspondent = CorrespondentField(allow_null=True)
    tags = TagsField(many=True)
    document_type = DocumentTypeField(allow_null=True)

    class Meta:
        model = Document
@@ -88,19 +89,13 @@ class DocumentSerializer(serializers.ModelSerializer):
        fields = (
            "id",
            "correspondent",
            "correspondent_id",
            "document_type",
            "document_type_id",
            "title",
            "content",
            "mime_type",
            "tags",
            "tags_id",
            "checksum",
            "created",
            "modified",
            "added",
            "file_name",
            "archive_serial_number"
        )

@@ -116,3 +111,82 @@ class LogSerializer(serializers.ModelSerializer):
            "group",
            "level"
        )


class PostDocumentSerializer(serializers.Serializer):

    document = serializers.FileField(
        label="Document",
        write_only=True,
    )

    title = serializers.CharField(
        label="Title",
        write_only=True,
        required=False,
    )

    correspondent = serializers.PrimaryKeyRelatedField(
        queryset=Correspondent.objects.all(),
        label="Correspondent",
        allow_null=True,
        write_only=True,
        required=False,
    )

    document_type = serializers.PrimaryKeyRelatedField(
        queryset=DocumentType.objects.all(),
        label="Document type",
        allow_null=True,
        write_only=True,
        required=False,
    )

    tags = serializers.PrimaryKeyRelatedField(
        many=True,
        queryset=Tag.objects.all(),
        label="Tags",
        write_only=True,
        required=False,
    )

    def validate_document(self, document):

        try:
            validate_filename(document.name)
        except ValidationError:
            raise serializers.ValidationError("Invalid filename.")

        document_data = document.file.read()
        mime_type = magic.from_buffer(document_data, mime=True)

        if not is_mime_type_supported(mime_type):
            raise serializers.ValidationError(
                "This file type is not supported.")

        return document.name, document_data

    def validate_title(self, title):
        if title:
            return title
        else:
            # do not return empty strings.
            return None

    def validate_correspondent(self, correspondent):
        if correspondent:
            return correspondent.id
        else:
            return None

    def validate_document_type(self, document_type):
        if document_type:
            return document_type.id
        else:
            return None

    def validate_tags(self, tags):
        if tags:
            return [tag.id for tag in tags]
        else:
            return None
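For illustration, uploading through this serializer's endpoint (the API tests below post to /api/documents/post_document/) could look like the following sketch; host, credentials and ids are placeholders, and requests handles the list value by repeating the "tags" form field:

    import requests

    # hypothetical host and credentials; the endpoint expects a
    # multipart form with at least the "document" field.
    with open("invoice.pdf", "rb") as f:
        response = requests.post(
            "http://localhost:8000/api/documents/post_document/",
            files={"document": f},
            data={"title": "my invoice", "tags": [1, 2]},
            auth=("user", "password"),
        )
    print(response.status_code)  # 200 on success, 400 on validation errors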
@@ -2,3 +2,4 @@
# for exporting/importing commands
EXPORTER_FILE_NAME = "__exported_file_name__"
EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__"
EXPORTER_ARCHIVE_NAME = "__exported_archive_name__"
@@ -9,10 +9,11 @@ from django.contrib.contenttypes.models import ContentType
from django.db import models, DatabaseError
from django.dispatch import receiver
from django.utils import timezone
from rest_framework.reverse import reverse

from .. import index, matching
from ..file_handling import delete_empty_directories, generate_filename, \
    create_source_path_directory
    create_source_path_directory, archive_name_from_filename
from ..models import Document, Tag
@@ -156,11 +157,11 @@ def run_post_consume_script(sender, document, **kwargs):
    Popen((
        settings.POST_CONSUME_SCRIPT,
        str(document.pk),
        document.file_name,
        document.source_path,
        document.thumbnail_path,
        None,
        None,
        document.get_public_filename(),
        os.path.normpath(document.source_path),
        os.path.normpath(document.thumbnail_path),
        reverse("document-download", kwargs={"pk": document.pk}),
        reverse("document-thumb", kwargs={"pk": document.pk}),
        str(document.correspondent),
        str(",".join(document.tags.all().values_list("slug", flat=True)))
    )).wait()
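The positional arguments now hand the script the document id, the display filename, normalized source and thumbnail paths, download and thumbnail URLs, the correspondent, and the tag slugs, in that order. A hypothetical consumer script (any executable works; Python shown for consistency):

    #!/usr/bin/env python3
    # hypothetical post-consume script; the unpacking mirrors the
    # argument order of the Popen call above
    import sys

    (doc_pk, public_filename, source_path, thumbnail_path,
     download_url, thumb_url, correspondent, tag_slugs) = sys.argv[1:9]

    print(f"consumed document {doc_pk} ({public_filename}) "
          f"from {correspondent}, tags: {tag_slugs}")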
@@ -168,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs):

@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
    for f in (instance.source_path, instance.thumbnail_path):
        try:
            os.unlink(f)
        except FileNotFoundError:
            pass  # The file's already gone, so we're cool with it.
    for f in (instance.source_path,
              instance.archive_path,
              instance.thumbnail_path):
        if os.path.isfile(f):
            try:
                os.unlink(f)
                logging.getLogger(__name__).debug(
                    f"Deleted file {f}.")
            except OSError as e:
                logging.getLogger(__name__).warning(
                    f"While deleting document {str(instance)}, the file "
                    f"{f} could not be deleted: {e}"
                )

    delete_empty_directories(os.path.dirname(instance.source_path))
    delete_empty_directories(
        os.path.dirname(instance.source_path),
        root=settings.ORIGINALS_DIR
    )

    delete_empty_directories(
        os.path.dirname(instance.archive_path),
        root=settings.ARCHIVE_DIR
    )


def validate_move(instance, old_path, new_path):
    if not os.path.isfile(old_path):
        # Can't do anything if the old file does not exist anymore.
        logging.getLogger(__name__).fatal(
            f"Document {str(instance)}: File {old_path} has gone.")
        return False

    if os.path.isfile(new_path):
        # Can't do anything if the new file already exists. Skip updating file.
        logging.getLogger(__name__).warning(
            f"Document {str(instance)}: Cannot rename file "
            f"since target path {new_path} already exists.")
        return False

    return True


@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@@ -182,51 +216,91 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
def update_filename_and_move_files(sender, instance, **kwargs):

    if not instance.filename:
        # Can't update the filename if there is not filename to begin with
        # This happens after the consumer creates a new document.
        # The PK needs to be set first by saving the document once. When this
        # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be
        # renamed anyway. In all other cases, instance.filename will be set.
        # Can't update the filename if there is no filename to begin with
        # This happens when the consumer creates a new document.
        # The document is modified and saved multiple times, and only after
        # everything is done (i.e., the generated filename is final),
        # filename will be set to the location where the consumer has put
        # the file.
        #
        # This will in turn cause this logic to move the file where it belongs.
        return

    old_filename = instance.filename
    old_path = instance.source_path
    new_filename = generate_filename(instance)

    if new_filename == instance.filename:
        # Don't do anything if it's the same.
        return

    new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
    old_source_path = instance.source_path
    new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)

    if not os.path.isfile(old_path):
        # Can't do anything if the old file does not exist anymore.
        logging.getLogger(__name__).fatal(
            f"Document {str(instance)}: File {old_path} has gone.")
    if not validate_move(instance, old_source_path, new_source_path):
        return

    if os.path.isfile(new_path):
        # Can't do anything if the new file already exists. Skip updating file.
        logging.getLogger(__name__).warning(
            f"Document {str(instance)}: Cannot rename file "
            f"since target path {new_path} already exists.")
        return
    # archive files are optional, archive checksum tells us if we have one,
    # since this is None for documents without archived files.
    if instance.archive_checksum:
        new_archive_filename = archive_name_from_filename(new_filename)
        old_archive_path = instance.archive_path
        new_archive_path = os.path.join(settings.ARCHIVE_DIR,
                                        new_archive_filename)

    create_source_path_directory(new_path)
        if not validate_move(instance, old_archive_path, new_archive_path):
            return

        create_source_path_directory(new_archive_path)
    else:
        old_archive_path = None
        new_archive_path = None

    create_source_path_directory(new_source_path)

    try:
        os.rename(old_path, new_path)
        os.rename(old_source_path, new_source_path)
        if instance.archive_checksum:
            os.rename(old_archive_path, new_archive_path)
        instance.filename = new_filename
        instance.save()
        # Don't save here to prevent infinite recursion.
        Document.objects.filter(pk=instance.pk).update(filename=new_filename)

        logging.getLogger(__name__).debug(
            f"Moved file {old_source_path} to {new_source_path}.")

        if instance.archive_checksum:
            logging.getLogger(__name__).debug(
                f"Moved file {old_archive_path} to {new_archive_path}.")

    except OSError as e:
        instance.filename = old_filename
        # this happens when we can't move a file. If that's the case for the
        # archive file, we try our best to revert the changes.
        try:
            os.rename(new_source_path, old_source_path)
            os.rename(new_archive_path, old_archive_path)
        except Exception as e:
            # This is fine, since:
            # A: if we managed to move source from A to B, we will also
            #    manage to move it from B to A. If not, we have a serious
            #    issue that's going to get caught by the sanity checker.
            #    All files remain in place and will never be overwritten,
            #    so this is not the end of the world.
            # B: if moving the original file failed, nothing has changed
            #    anyway.
            pass
    except DatabaseError as e:
        os.rename(new_path, old_path)
        os.rename(new_source_path, old_source_path)
        if instance.archive_checksum:
            os.rename(new_archive_path, old_archive_path)
        instance.filename = old_filename

    if not os.path.isfile(old_path):
        delete_empty_directories(os.path.dirname(old_path))
    if not os.path.isfile(old_source_path):
        delete_empty_directories(os.path.dirname(old_source_path),
                                 root=settings.ORIGINALS_DIR)

    if old_archive_path and not os.path.isfile(old_archive_path):
        delete_empty_directories(os.path.dirname(old_archive_path),
                                 root=settings.ARCHIVE_DIR)


def set_log_entry(sender, document=None, logging_group=None, **kwargs):
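The switch from instance.save() to a queryset update() above is what breaks the recursion: saving inside the handler would fire the same signals that invoke it, while QuerySet.update() issues a single SQL UPDATE and sends no model signals. A minimal sketch of the difference:

    # runs full model save logic and emits pre_save/post_save,
    # which would re-enter this handler:
    instance.filename = new_filename
    instance.save()

    # issues one SQL UPDATE and emits no signals, so the rename is
    # persisted without re-triggering the handler:
    Document.objects.filter(pk=instance.pk).update(filename=new_filename)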
@@ -3,15 +3,18 @@ import logging

from django.conf import settings
from whoosh.writing import AsyncWriter

from documents import index
from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, \
    IncompatibleClassifierVersionError
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
from documents.sanity_checker import SanityFailedError


def index_optimize():
    index.open_index().optimize()
    ix = index.open_index()
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)
def index_reindex():
@@ -74,3 +77,12 @@ def consume_file(path,
    else:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")


def sanity_check():
    messages = sanity_checker.check_sanity()

    if len(messages) > 0:
        raise SanityFailedError(messages)
    else:
        return "No issues detected."
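Since paperless dispatches work through django-q (async_task is the call patched in the API tests below), the new sanity_check task lends itself to a recurring schedule. A hedged sketch; the task path is real, the schedule name and interval are assumptions:

    # hypothetical scheduling snippet using django-q's scheduler
    from django_q.models import Schedule
    from django_q.tasks import schedule

    schedule(
        "documents.tasks.sanity_check",
        name="Perform sanity check",   # assumed label
        schedule_type=Schedule.WEEKLY,  # assumed interval
    )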
(image file changed: 32 KiB before, 32 KiB after)
src/documents/tests/samples/documents/archive/0000001.pdf (new binary file)
src/documents/tests/samples/documents/originals/0000002.pdf.gpg (new binary file)
src/documents/tests/samples/documents/thumbnails/0000001.png (new binary file, 7.7 KiB)
src/documents/tests/samples/documents/thumbnails/0000002.png.gpg (new binary file)
src/documents/tests/samples/simple.pdf (new binary file)
src/documents/tests/samples/simple.zip (new binary file)
@@ -1,40 +1,25 @@
import os
import shutil
import tempfile
from unittest import mock

from django.contrib.auth.models import User
from django.test import override_settings
from pathvalidate import ValidationError
from rest_framework.test import APITestCase
from whoosh.writing import AsyncWriter

from documents import index
from documents.models import Document, Correspondent, DocumentType, Tag
from documents.tests.utils import DirectoriesMixin


class DocumentApiTest(APITestCase):
class TestDocumentApi(DirectoriesMixin, APITestCase):

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.originals_dir = os.path.join(self.media_dir, "documents", "originals")
        self.thumbnail_dir = os.path.join(self.media_dir, "documents", "thumbnails")

        os.makedirs(self.originals_dir, exist_ok=True)
        os.makedirs(self.thumbnail_dir, exist_ok=True)

        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumbnail_dir
        ).enable()
        super(TestDocumentApi, self).setUp()

        user = User.objects.create_superuser(username="temp_admin")
        self.client.force_login(user=user)

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)

    def testDocuments(self):

        response = self.client.get("/api/documents/").data
@@ -56,20 +41,13 @@ class DocumentApiTest(APITestCase):
        returned_doc = response.data['results'][0]
        self.assertEqual(returned_doc['id'], doc.id)
        self.assertEqual(returned_doc['title'], doc.title)
        self.assertEqual(returned_doc['correspondent']['name'], c.name)
        self.assertEqual(returned_doc['document_type']['name'], dt.name)
        self.assertEqual(returned_doc['correspondent']['id'], c.id)
        self.assertEqual(returned_doc['document_type']['id'], dt.id)
        self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id'])
        self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id'])
        self.assertEqual(len(returned_doc['tags']), 1)
        self.assertEqual(returned_doc['tags'][0]['name'], tag.name)
        self.assertEqual(returned_doc['tags'][0]['id'], tag.id)
        self.assertListEqual(returned_doc['tags_id'], [tag.id])
        self.assertEqual(returned_doc['correspondent'], c.id)
        self.assertEqual(returned_doc['document_type'], dt.id)
        self.assertListEqual(returned_doc['tags'], [tag.id])

        c2 = Correspondent.objects.create(name="c2")

        returned_doc['correspondent_id'] = c2.pk
        returned_doc['correspondent'] = c2.pk
        returned_doc['title'] = "the new title"

        response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json')
@@ -87,7 +65,7 @@ class DocumentApiTest(APITestCase):

    def test_document_actions(self):

        _, filename = tempfile.mkstemp(dir=self.originals_dir)
        _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)

        content = b"This is a test"
        content_thumbnail = b"thumbnail content"
@@ -97,7 +75,7 @@ class DocumentApiTest(APITestCase):

        doc = Document.objects.create(title="none", filename=os.path.basename(filename), mime_type="application/pdf")

        with open(os.path.join(self.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
        with open(os.path.join(self.dirs.thumbnail_dir, "{:07d}.png".format(doc.pk)), "wb") as f:
            f.write(content_thumbnail)

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
@@ -115,6 +93,44 @@ class DocumentApiTest(APITestCase):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_thumbnail)
    def test_download_with_archive(self):

        _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)

        content = b"This is a test"
        content_archive = b"This is the same test but archived"

        with open(filename, "wb") as f:
            f.write(content)

        filename = os.path.basename(filename)

        doc = Document.objects.create(title="none", filename=filename,
                                      mime_type="application/pdf")

        with open(doc.archive_path, "wb") as f:
            f.write(content_archive)

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_archive)

        response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content)

        response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_archive)

        response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content)

    def test_document_actions_not_existing_file(self):

        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
@@ -179,6 +195,109 @@ class DocumentApiTest(APITestCase):
        results = response.data['results']
        self.assertEqual(len(results), 3)

    def test_search_no_query(self):
        response = self.client.get("/api/search/")
        results = response.data['results']

        self.assertEqual(len(results), 0)

    def test_search(self):
        d1 = Document.objects.create(title="invoice", content="the thing i bought at a shop and paid with bank account", checksum="A", pk=1)
        d2 = Document.objects.create(title="bank statement 1", content="things i paid for in august", pk=2, checksum="B")
        d3 = Document.objects.create(title="bank statement 3", content="things i paid for in september", pk=3, checksum="C")
        with AsyncWriter(index.open_index()) as writer:
            # Note to future self: there is a reason we don't use a model
            # signal handler to update the index: some operations edit many
            # documents at once (retagger, renamer) and we don't want to
            # open a writer for each of these, but rather perform the entire
            # operation with one writer. That's why we can't open the writer
            # in a model on_save handler or something.
            index.update_document(writer, d1)
            index.update_document(writer, d2)
            index.update_document(writer, d3)
        response = self.client.get("/api/search/?query=bank")
        results = response.data['results']
        self.assertEqual(response.data['count'], 3)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 3)

        response = self.client.get("/api/search/?query=september")
        results = response.data['results']
        self.assertEqual(response.data['count'], 1)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 1)

        response = self.client.get("/api/search/?query=statement")
        results = response.data['results']
        self.assertEqual(response.data['count'], 2)
        self.assertEqual(response.data['page'], 1)
        self.assertEqual(response.data['page_count'], 1)
        self.assertEqual(len(results), 2)

        response = self.client.get("/api/search/?query=sfegdfg")
        results = response.data['results']
        self.assertEqual(response.data['count'], 0)
        self.assertEqual(response.data['page'], 0)
        self.assertEqual(response.data['page_count'], 0)
        self.assertEqual(len(results), 0)
    def test_search_multi_page(self):
        with AsyncWriter(index.open_index()) as writer:
            for i in range(55):
                doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content")
                index.update_document(writer, doc)

        # This is here so that we test that no document gets returned twice
        # (might happen if the paging is not working)
        seen_ids = []

        for i in range(1, 6):
            response = self.client.get(f"/api/search/?query=content&page={i}")
            results = response.data['results']
            self.assertEqual(response.data['count'], 55)
            self.assertEqual(response.data['page'], i)
            self.assertEqual(response.data['page_count'], 6)
            self.assertEqual(len(results), 10)

            for result in results:
                self.assertNotIn(result['id'], seen_ids)
                seen_ids.append(result['id'])

        response = self.client.get(f"/api/search/?query=content&page=6")
        results = response.data['results']
        self.assertEqual(response.data['count'], 55)
        self.assertEqual(response.data['page'], 6)
        self.assertEqual(response.data['page_count'], 6)
        self.assertEqual(len(results), 5)

        for result in results:
            self.assertNotIn(result['id'], seen_ids)
            seen_ids.append(result['id'])

        response = self.client.get(f"/api/search/?query=content&page=7")
        results = response.data['results']
        self.assertEqual(response.data['count'], 55)
        self.assertEqual(response.data['page'], 6)
        self.assertEqual(response.data['page_count'], 6)
        self.assertEqual(len(results), 5)

    def test_search_invalid_page(self):
        with AsyncWriter(index.open_index()) as writer:
            for i in range(15):
                doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content="content")
                index.update_document(writer, doc)

        first_page = self.client.get(f"/api/search/?query=content&page=1").data
        second_page = self.client.get(f"/api/search/?query=content&page=2").data
        should_be_first_page_1 = self.client.get(f"/api/search/?query=content&page=0").data
        should_be_first_page_2 = self.client.get(f"/api/search/?query=content&page=dgfd").data
        should_be_first_page_3 = self.client.get(f"/api/search/?query=content&page=").data
        should_be_first_page_4 = self.client.get(f"/api/search/?query=content&page=-7868").data

        self.assertDictEqual(first_page, should_be_first_page_1)
        self.assertDictEqual(first_page, should_be_first_page_2)
        self.assertDictEqual(first_page, should_be_first_page_3)
        self.assertDictEqual(first_page, should_be_first_page_4)
        self.assertNotEqual(len(first_page['results']), len(second_page['results']))
    @mock.patch("documents.index.autocomplete")
    def test_search_autocomplete(self, m):
        m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
@@ -201,6 +320,22 @@ class DocumentApiTest(APITestCase):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(len(response.data), 10)

    def test_search_spelling_correction(self):
        with AsyncWriter(index.open_index()) as writer:
            for i in range(55):
                doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}")
                index.update_document(writer, doc)

        response = self.client.get("/api/search/?query=thing")
        correction = response.data['corrected_query']

        self.assertEqual(correction, "things")

        response = self.client.get("/api/search/?query=things")
        correction = response.data['corrected_query']

        self.assertEqual(correction, None)

    def test_statistics(self):

        doc1 = Document.objects.create(title="none1", checksum="A")
@@ -215,3 +350,128 @@ class DocumentApiTest(APITestCase):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.data['documents_total'], 3)
        self.assertEqual(response.data['documents_inbox'], 1)
    @mock.patch("documents.views.async_task")
    def test_upload(self, m):

        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f})

        self.assertEqual(response.status_code, 200)

        m.assert_called_once()

        args, kwargs = m.call_args
        self.assertEqual(kwargs['override_filename'], "simple.pdf")
        self.assertIsNone(kwargs['override_title'])
        self.assertIsNone(kwargs['override_correspondent_id'])
        self.assertIsNone(kwargs['override_document_type_id'])
        self.assertIsNone(kwargs['override_tag_ids'])

    @mock.patch("documents.views.async_task")
    def test_upload_invalid_form(self, m):

        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"documenst": f})
        self.assertEqual(response.status_code, 400)
        m.assert_not_called()

    @mock.patch("documents.views.async_task")
    def test_upload_invalid_file(self, m):

        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.zip"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f})
        self.assertEqual(response.status_code, 400)
        m.assert_not_called()

    @mock.patch("documents.views.async_task")
    @mock.patch("documents.serialisers.validate_filename")
    def test_upload_invalid_filename(self, validate_filename, async_task):
        validate_filename.side_effect = ValidationError()
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f})
        self.assertEqual(response.status_code, 400)

        async_task.assert_not_called()

    @mock.patch("documents.views.async_task")
    def test_upload_with_title(self, async_task):
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f, "title": "my custom title"})
        self.assertEqual(response.status_code, 200)

        async_task.assert_called_once()

        args, kwargs = async_task.call_args

        self.assertEqual(kwargs['override_title'], "my custom title")

    @mock.patch("documents.views.async_task")
    def test_upload_with_correspondent(self, async_task):
        c = Correspondent.objects.create(name="test-corres")
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": c.id})
        self.assertEqual(response.status_code, 200)

        async_task.assert_called_once()

        args, kwargs = async_task.call_args

        self.assertEqual(kwargs['override_correspondent_id'], c.id)

    @mock.patch("documents.views.async_task")
    def test_upload_with_invalid_correspondent(self, async_task):
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": 3456})
        self.assertEqual(response.status_code, 400)

        async_task.assert_not_called()

    @mock.patch("documents.views.async_task")
    def test_upload_with_document_type(self, async_task):
        dt = DocumentType.objects.create(name="invoice")
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": dt.id})
        self.assertEqual(response.status_code, 200)

        async_task.assert_called_once()

        args, kwargs = async_task.call_args

        self.assertEqual(kwargs['override_document_type_id'], dt.id)

    @mock.patch("documents.views.async_task")
    def test_upload_with_invalid_document_type(self, async_task):
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": 34578})
        self.assertEqual(response.status_code, 400)

        async_task.assert_not_called()

    @mock.patch("documents.views.async_task")
    def test_upload_with_tags(self, async_task):
        t1 = Tag.objects.create(name="tag1")
        t2 = Tag.objects.create(name="tag2")
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post(
                "/api/documents/post_document/",
                {"document": f, "tags": [t2.id, t1.id]})
        self.assertEqual(response.status_code, 200)

        async_task.assert_called_once()

        args, kwargs = async_task.call_args

        self.assertCountEqual(kwargs['override_tag_ids'], [t1.id, t2.id])

    @mock.patch("documents.views.async_task")
    def test_upload_with_invalid_tags(self, async_task):
        t1 = Tag.objects.create(name="tag1")
        t2 = Tag.objects.create(name="tag2")
        with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
            response = self.client.post(
                "/api/documents/post_document/",
                {"document": f, "tags": [t2.id, t1.id, 734563]})
        self.assertEqual(response.status_code, 400)

        async_task.assert_not_called()
@@ -1,24 +1,29 @@
import tempfile
from time import sleep
from unittest import mock

from django.test import TestCase, override_settings

from documents.classifier import DocumentClassifier
from documents.classifier import DocumentClassifier, IncompatibleClassifierVersionError
from documents.models import Correspondent, Document, Tag, DocumentType
from documents.tests.utils import DirectoriesMixin


class TestClassifier(TestCase):
class TestClassifier(DirectoriesMixin, TestCase):

    def setUp(self):

        super(TestClassifier, self).setUp()
        self.classifier = DocumentClassifier()

    def generate_test_data(self):
        self.c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        self.c2 = Correspondent.objects.create(name="c2")
        self.c3 = Correspondent.objects.create(name="c3", matching_algorithm=Correspondent.MATCH_AUTO)
        self.t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        self.t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_ANY, pk=34, is_inbox_tag=True)
        self.t3 = Tag.objects.create(name="t3", matching_algorithm=Tag.MATCH_AUTO, pk=45)
        self.dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)
        self.dt2 = DocumentType.objects.create(name="dt2", matching_algorithm=DocumentType.MATCH_AUTO)

        self.doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=self.c1, checksum="A", document_type=self.dt)
        self.doc2 = Document.objects.create(title="doc1", content="this is another document, but from c2", correspondent=self.c2, checksum="B")
@@ -59,8 +64,8 @@ class TestClassifier(TestCase):
        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(self.doc1.content), self.c1.pk)
        self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
        self.assertTupleEqual(self.classifier.predict_tags(self.doc1.content), (self.t1.pk,))
        self.assertTupleEqual(self.classifier.predict_tags(self.doc2.content), (self.t1.pk, self.t3.pk))
        self.assertListEqual(self.classifier.predict_tags(self.doc1.content), [self.t1.pk])
        self.assertListEqual(self.classifier.predict_tags(self.doc2.content), [self.t1.pk, self.t3.pk])
        self.assertEqual(self.classifier.predict_document_type(self.doc1.content), self.dt.pk)
        self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
@@ -71,6 +76,44 @@ class TestClassifier(TestCase):
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

    def testVersionIncreased(self):

        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.assertFalse(self.classifier.train())

        self.classifier.save_classifier()

        classifier2 = DocumentClassifier()

        current_ver = DocumentClassifier.FORMAT_VERSION
        with mock.patch("documents.classifier.DocumentClassifier.FORMAT_VERSION", current_ver+1):
            # assure that we won't load old classifiers.
            self.assertRaises(IncompatibleClassifierVersionError, classifier2.reload)

            self.classifier.save_classifier()

            # assure that we can load the classifier after saving it.
            classifier2.reload()

    def testReload(self):

        self.generate_test_data()
        self.assertTrue(self.classifier.train())
        self.classifier.save_classifier()

        classifier2 = DocumentClassifier()
        classifier2.reload()
        v1 = classifier2.classifier_version

        # change the classifier after some time.
        sleep(1)
        self.classifier.save_classifier()

        classifier2.reload()
        v2 = classifier2.classifier_version
        self.assertNotEqual(v1, v2)

    @override_settings(DATA_DIR=tempfile.mkdtemp())
    def testSaveClassifier(self):

@@ -83,3 +126,112 @@ class TestClassifier(TestCase):
        new_classifier = DocumentClassifier()
        new_classifier.reload()
        self.assertFalse(new_classifier.train())
    def test_one_correspondent_predict(self):
        c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk)

    def test_one_correspondent_predict_manydocs(self):
        c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO)
        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from noone", checksum="B")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_correspondent(doc1.content), c1.pk)
        self.assertIsNone(self.classifier.predict_correspondent(doc2.content))

    def test_one_type_predict(self):
        dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1",
                                       checksum="A", document_type=dt)

        self.classifier.train()
        self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk)

    def test_one_type_predict_manydocs(self):
        dt = DocumentType.objects.create(name="dt", matching_algorithm=DocumentType.MATCH_AUTO)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1",
                                       checksum="A", document_type=dt)

        doc2 = Document.objects.create(title="doc1", content="this is a document from c2",
                                       checksum="B")

        self.classifier.train()
        self.assertEqual(self.classifier.predict_document_type(doc1.content), dt.pk)
        self.assertIsNone(self.classifier.predict_document_type(doc2.content))

    def test_one_tag_predict(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")

        doc1.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])

    def test_one_tag_predict_unassigned(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")

        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [])

    def test_two_tags_predict_singledoc(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)

        doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")

        doc4.tags.add(t1)
        doc4.tags.add(t2)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk])

    def test_two_tags_predict(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)
        t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO, pk=121)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc1", content="this is a document from c2", checksum="B")
        doc3 = Document.objects.create(title="doc1", content="this is a document from c3", checksum="C")
        doc4 = Document.objects.create(title="doc1", content="this is a document from c4", checksum="D")

        doc1.tags.add(t1)
        doc2.tags.add(t2)

        doc4.tags.add(t1)
        doc4.tags.add(t2)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [t2.pk])
        self.assertListEqual(self.classifier.predict_tags(doc3.content), [])
        self.assertListEqual(self.classifier.predict_tags(doc4.content), [t1.pk, t2.pk])

    def test_one_tag_predict_multi(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")

        doc1.tags.add(t1)
        doc2.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [t1.pk])

    def test_one_tag_predict_multi_2(self):
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO, pk=12)

        doc1 = Document.objects.create(title="doc1", content="this is a document from c1", checksum="A")
        doc2 = Document.objects.create(title="doc2", content="this is a document from c2", checksum="B")

        doc1.tags.add(t1)
        self.classifier.train()
        self.assertListEqual(self.classifier.predict_tags(doc1.content), [t1.pk])
        self.assertListEqual(self.classifier.predict_tags(doc2.content), [])
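These single-tag cases matter because of how sklearn's binarizers behave when only one class exists in the training data; a small sketch of the shapes involved, using sklearn calls that behave as shown:

    from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer

    mlb = MultiLabelBinarizer()
    # with a single known tag, the indicator matrix has just one column
    print(mlb.fit_transform([[12], [], [12]]))  # [[1], [0], [1]]
    print(mlb.classes_)                         # [12]

    lb = LabelBinarizer()
    # the binary fallback encodes "no tag" as -1 and yields a flat 0/1 vector
    print(lb.fit_transform([12, -1, 12]).ravel())  # [1 0 1]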
@@ -7,6 +7,7 @@ from unittest.mock import MagicMock

from django.test import TestCase, override_settings

from .utils import DirectoriesMixin
from ..consumer import Consumer, ConsumerError
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
from ..parsers import DocumentParser, ParseError
@@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase):

class DummyParser(DocumentParser):

    def get_thumbnail(self):
    def get_thumbnail(self, document_path, mime_type):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, path, logging_group, scratch_dir):
        super(DummyParser, self).__init__(path, logging_group)
    def __init__(self, logging_group, scratch_dir, archive_path):
        super(DummyParser, self).__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        self.archive_path = archive_path

    def get_optimised_thumbnail(self):
    def get_optimised_thumbnail(self, document_path, mime_type):
        return self.fake_thumb

    def get_text(self):
        return "The Text"
    def parse(self, document_path, mime_type):
        self.text = "The Text"


class FaultyParser(DocumentParser):

    def get_thumbnail(self):
    def get_thumbnail(self, document_path, mime_type):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, path, logging_group, scratch_dir):
        super(FaultyParser, self).__init__(path, logging_group)
    def __init__(self, logging_group, scratch_dir):
        super(FaultyParser, self).__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)

    def get_optimised_thumbnail(self):
    def get_optimised_thumbnail(self, document_path, mime_type):
        return self.fake_thumb

    def get_text(self):
    def parse(self, document_path, mime_type):
        raise ParseError("Does not compute.")
@@ -408,32 +410,22 @@ def fake_magic_from_file(file, mime=False):


@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(TestCase):
class TestConsumer(DirectoriesMixin, TestCase):

    def make_dummy_parser(self, path, logging_group):
        return DummyParser(path, logging_group, self.scratch_dir)
    def make_dummy_parser(self, logging_group):
        return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())

    def make_faulty_parser(self, path, logging_group):
        return FaultyParser(path, logging_group, self.scratch_dir)
    def make_faulty_parser(self, logging_group):
        return FaultyParser(logging_group, self.dirs.scratch_dir)

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.consumption_dir = tempfile.mkdtemp()

        override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=os.path.join(self.media_dir, "documents", "originals"),
            THUMBNAIL_DIR=os.path.join(self.media_dir, "documents", "thumbnails"),
            CONSUMPTION_DIR=self.consumption_dir
        ).enable()
        super(TestConsumer, self).setUp()

        patcher = mock.patch("documents.parsers.document_consumer_declaration.send")
        m = patcher.start()
        m.return_value = [(None, {
            "parser": self.make_dummy_parser,
            "mime_types": ["application/pdf"],
            "mime_types": {"application/pdf": ".pdf"},
            "weight": 0
        })]
@@ -441,15 +433,19 @@ class TestConsumer(TestCase):

        self.consumer = Consumer()

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
        shutil.rmtree(self.consumption_dir, ignore_errors=True)

    def get_test_file(self):
        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
        return f
        src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf")
        dst = os.path.join(self.dirs.scratch_dir, "sample.pdf")
        shutil.copy(src, dst)
        return dst

    def get_test_archive_file(self):
        src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf")
        dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf")
        shutil.copy(src, dst)
        return dst

    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
    def testNormalOperation(self):

        filename = self.get_test_file()
@@ -469,6 +465,13 @@ class TestConsumer(TestCase):
            document.thumbnail_path
        ))

        self.assertTrue(os.path.isfile(
            document.archive_path
        ))

        self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
        self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")

        self.assertFalse(os.path.isfile(filename))

    def testOverrideFilename(self):
@@ -516,27 +519,7 @@ class TestConsumer(TestCase):

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR=None)
    def testConsumptionDirUnset(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "The CONSUMPTION_DIR settings variable does not appear to be set.")
            return

        self.fail("Should throw exception")

    @override_settings(CONSUMPTION_DIR="asd")
    def testNoConsumptionDir(self):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertEqual(str(e), "Consumption directory asd does not exist")
            return

        self.fail("Should throw exception")

    def testDuplicates(self):
    def testDuplicates1(self):
        self.consumer.try_consume_file(self.get_test_file())

        try:
@@ -547,6 +530,21 @@ class TestConsumer(TestCase):

        self.fail("Should throw exception")

    def testDuplicates2(self):
        self.consumer.try_consume_file(self.get_test_file())

        try:
            self.consumer.try_consume_file(self.get_test_archive_file())
        except ConsumerError as e:
            self.assertTrue(str(e).endswith("It is a duplicate."))
            return

        self.fail("Should throw exception")

    def testDuplicates3(self):
        self.consumer.try_consume_file(self.get_test_archive_file())
        self.consumer.try_consume_file(self.get_test_file())

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testNoParsers(self, m):
        m.return_value = []
@@ -554,7 +552,7 @@ class TestConsumer(TestCase):
        try:
            self.consumer.try_consume_file(self.get_test_file())
        except ConsumerError as e:
            self.assertTrue(str(e).startswith("No parsers abvailable"))
            self.assertTrue("No parsers abvailable for" in str(e))
            return

        self.fail("Should throw exception")
@@ -563,7 +561,7 @@ class TestConsumer(TestCase):
    def testFaultyParser(self, m):
        m.return_value = [(None, {
            "parser": self.make_faulty_parser,
            "mime_types": ["application/pdf"],
            "mime_types": {"application/pdf": ".pdf"},
            "weight": 0
        })]
@@ -598,12 +596,33 @@ class TestConsumer(TestCase):

        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")

        print(document.source_path)
        print("===")
        self.assertEqual(document.title, "new docs")
        self.assertEqual(document.correspondent.name, "Bank")
        self.assertEqual(document.filename, "Bank/new docs-0000001.pdf")

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.generate_filename")
    def testFilenameHandlingUnstableFormat(self, m):

        filenames = ["this", "that", "now this", "i cant decide"]

        def get_filename():
            f = filenames.pop()
            filenames.insert(0, f)
            return f

        m.side_effect = lambda f: get_filename()

        filename = self.get_test_file()

        Tag.objects.create(name="test", is_inbox_tag=True)

        document = self.consumer.try_consume_file(filename, override_filename="Bank - Test.pdf", override_title="new docs")

        self.assertEqual(document.title, "new docs")
        self.assertEqual(document.correspondent.name, "Bank")
        self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")
        self.assertIsNotNone(os.path.isfile(document.title))
        self.assertTrue(os.path.isfile(document.source_path))

    @mock.patch("documents.consumer.DocumentClassifier")
    def testClassifyDocument(self, m):
src/documents/tests/test_date_parsing.py (new file, 140 lines)
@@ -0,0 +1,140 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4

from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings

from documents.parsers import parse_date
from paperless_tesseract.parsers import RasterisedDocumentParser


class TestDate(TestCase):

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples")
    SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])

    def setUp(self):
        os.makedirs(self.SCRATCH, exist_ok=True)

    def tearDown(self):
        shutil.rmtree(self.SCRATCH)

    def test_date_format_1(self):
        text = "lorem ipsum 130218 lorem ipsum"
        self.assertEqual(parse_date("", text), None)

    def test_date_format_2(self):
        text = "lorem ipsum 2018 lorem ipsum"
        self.assertEqual(parse_date("", text), None)

    def test_date_format_3(self):
        text = "lorem ipsum 20180213 lorem ipsum"
        self.assertEqual(parse_date("", text), None)

    def test_date_format_4(self):
        text = "lorem ipsum 13.02.2018 lorem ipsum"
        date = parse_date("", text)
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    def test_date_format_5(self):
        text = (
            "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
            "ipsum"
        )
        date = parse_date("", text)
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    def test_date_format_6(self):
        text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum"
        )
        self.assertEqual(parse_date("", text), None)

    def test_date_format_7(self):
        text = (
            "lorem ipsum\n"
            "März 2019\n"
            "lorem ipsum"
        )
        date = parse_date("", text)
        self.assertEqual(
            date,
            datetime.datetime(
                2019, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    def test_date_format_8(self):
        text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum\n"
            "März 2020"
        )
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(
                2020, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_9(self):
        text = (
            "lorem ipsum\n"
            "27. Nullmonth 2020\n"
            "März 2020\n"
            "lorem ipsum"
        )
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(
                2020, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    def test_crazy_date_past(self, *args):
        self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))

    def test_crazy_date_future(self, *args):
        self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))

    def test_crazy_date_with_spaces(self, *args):
        self.assertIsNone(parse_date("", "20 408000l 2475"))

    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_filename_date_parse_invalid(self, *args):
        self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
@@ -1,12 +1,29 @@
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from unittest import mock

from django.test import TestCase
from django.test import TestCase, override_settings

from ..models import Document, Correspondent


class TestDocument(TestCase):

    def setUp(self) -> None:
        self.originals_dir = tempfile.mkdtemp()
        self.thumb_dir = tempfile.mkdtemp()

        override_settings(
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumb_dir,
        ).enable()

    def tearDown(self) -> None:
        shutil.rmtree(self.originals_dir)
        shutil.rmtree(self.thumb_dir)

    def test_file_deletion(self):
        document = Document.objects.create(
            correspondent=Correspondent.objects.create(name="Test0"),
@@ -19,8 +36,31 @@ class TestDocument(TestCase):
        file_path = document.source_path
        thumb_path = document.thumbnail_path

        Path(file_path).touch()
        Path(thumb_path).touch()

        with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
            document.delete()
            mock_unlink.assert_any_call(file_path)
            mock_unlink.assert_any_call(thumb_path)
            self.assertEqual(mock_unlink.call_count, 2)

    def test_file_name(self):

        doc = Document(mime_type="application/pdf", title="test", created=datetime(2020, 12, 25))
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.pdf")

    def test_file_name_jpg(self):

        doc = Document(mime_type="image/jpeg", title="test", created=datetime(2020, 12, 25))
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.jpg")

    def test_file_name_unknown(self):

        doc = Document(mime_type="application/zip", title="test", created=datetime(2020, 12, 25))
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test.zip")

    def test_file_name_invalid_type(self):

        doc = Document(mime_type="image/jpegasd", title="test", created=datetime(2020, 12, 25))
        self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
@@ -1,32 +1,18 @@
import datetime
import os
import shutil
from pathlib import Path
from uuid import uuid4
from unittest import mock

from django.conf import settings
from django.db import DatabaseError
from django.test import TestCase, override_settings

from .utils import DirectoriesMixin
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
from ..models import Document, Correspondent
from ..signals.handlers import update_filename_and_move_files


class TestDate(TestCase):
    deletion_list = []

    def add_to_deletion_list(self, dirname):
        self.deletion_list.append(dirname)

    def setUp(self):
        folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
        os.makedirs(folder + "/documents/originals")
        override_settings(MEDIA_ROOT=folder).enable()
        override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
        self.add_to_deletion_list(folder)

    def tearDown(self):
        for dirname in self.deletion_list:
            shutil.rmtree(dirname, ignore_errors=True)
class TestFileHandling(DirectoriesMixin, TestCase):

    @override_settings(PAPERLESS_FILENAME_FORMAT="")
    def test_generate_source_filename(self):
@@ -103,7 +89,7 @@ class TestDate(TestCase):
        document.save()

        # Check proper handling of files
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

        os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
@@ -133,18 +119,14 @@ class TestDate(TestCase):
        document.correspondent = Correspondent.objects.get_or_create(
            name="test")[0]

        # This will cause save() to fail.
        document.checksum = document1.checksum
        with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
            m.side_effect = DatabaseError()
            document.save()

        # Assume saving the document initially works, this gets called.
        # After renaming, an error occurs, and filename is not saved:
        # document should still be available at document.filename.
        update_filename_and_move_files(None, document)

        # Check proper handling of files
        self.assertTrue(os.path.isfile(document.source_path))
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
        # Check proper handling of files
        self.assertTrue(os.path.isfile(document.source_path))
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
    def test_document_delete(self):
@@ -199,8 +181,8 @@ class TestDate(TestCase):
        document.save()

        # Check proper handling of files
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
        self.assertTrue(os.path.isfile(important_file))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
@@ -318,13 +300,12 @@ class TestDate(TestCase):
        # Create our working directory
        tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty")
        os.makedirs(tmp)
        self.add_to_deletion_list(tmp)

        os.makedirs(os.path.join(tmp, "notempty"))
        Path(os.path.join(tmp, "notempty", "file")).touch()
        os.makedirs(os.path.join(tmp, "notempty", "empty"))

        delete_empty_directories(os.path.join(tmp, "notempty", "empty"))
        delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR)
        self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
        self.assertEqual(os.path.isfile(
            os.path.join(tmp, "notempty", "file")), True)
@@ -348,3 +329,179 @@ class TestDate(TestCase):
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        self.assertEqual(generate_filename(document), "0000001.pdf")


class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):

    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
    def test_create_no_format(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_create_with_format(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertFalse(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))
        self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf"))
        self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf"))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_archive_gone(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        #Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertFalse(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_archive_exists(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
        Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.os.rename")
    def test_move_archive_error(self, m):

        def fake_rename(src, dst):
            if "archive" in src:
                raise OSError()
            else:
                os.remove(src)
                Path(dst).touch()

        m.side_effect = fake_rename

        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_file_gone(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        #Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertFalse(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertFalse(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.os.rename")
    def test_move_file_error(self, m):

        def fake_rename(src, dst):
            if "original" in src:
                raise OSError()
            else:
                os.remove(src)
                Path(dst).touch()

        m.side_effect = fake_rename

        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    def test_archive_deleted(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

        doc.delete()

        self.assertFalse(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertFalse(os.path.isfile(doc.source_path))
        self.assertFalse(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_database_error(self):

        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
        with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
            m.side_effect = DatabaseError()
            doc.save()

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))


class TestFilenameGeneration(TestCase):

    @override_settings(
        PAPERLESS_FILENAME_FORMAT="{title}"
    )
    def test_invalid_characters(self):

        doc = Document.objects.create(title="This. is the title.", mime_type="application/pdf", pk=1, checksum="1")
        self.assertEqual(generate_filename(doc), "This. is the title-0000001.pdf")

        doc = Document.objects.create(title="my\\invalid/../title:yay", mime_type="application/pdf", pk=2, checksum="2")
        self.assertEqual(generate_filename(doc), "my-invalid-..-title-yay-0000002.pdf")

    @override_settings(
        PAPERLESS_FILENAME_FORMAT="{created}"
    )
    def test_date(self):
        doc = Document.objects.create(title="does not matter", created=datetime.datetime(2020, 5, 21, 7, 36, 51, 153), mime_type="application/pdf", pk=2, checksum="2")
        self.assertEqual(generate_filename(doc), "2020-05-21-0000002.pdf")
@@ -2,7 +2,7 @@ import logging
import uuid
from unittest import mock

from django.test import TestCase
from django.test import TestCase, override_settings

from ..models import Log

@@ -14,6 +14,7 @@ class TestPaperlessLog(TestCase):
        self.logger = logging.getLogger(
            "documents.management.commands.document_consumer")

    @override_settings(DISABLE_DBHANDLER=False)
    def test_that_it_saves_at_all(self):

        kw = {"group": uuid.uuid4()}
@@ -38,6 +39,7 @@ class TestPaperlessLog(TestCase):
        self.logger.critical("This is a critical message", extra=kw)
        self.assertEqual(Log.objects.all().count(), 5)

    @override_settings(DISABLE_DBHANDLER=False)
    def test_groups(self):

        kw1 = {"group": uuid.uuid4()}
src/documents/tests/test_management_archiver.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import filecmp
import os
import shutil

from django.core.management import call_command
from django.test import TestCase

from documents.management.commands.document_archiver import handle_document
from documents.models import Document
from documents.tests.utils import DirectoriesMixin


sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")


class TestArchiver(DirectoriesMixin, TestCase):

    def make_models(self):
        self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf")
        #self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
        #self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")

    def test_archiver(self):

        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        self.make_models()

        call_command('document_archiver')

    def test_handle_document(self):

        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        self.make_models()

        handle_document(self.d1.pk)

        doc = Document.objects.get(id=self.d1.id)

        self.assertIsNotNone(doc.checksum)
        self.assertTrue(os.path.isfile(doc.archive_path))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
src/documents/tests/test_management_consumer.py (new file, 262 lines)
@@ -0,0 +1,262 @@
import filecmp
import os
import shutil
from threading import Thread
from time import sleep
from unittest import mock

from django.conf import settings
from django.core.management import call_command, CommandError
from django.test import override_settings, TransactionTestCase

from documents.models import Tag
from documents.consumer import ConsumerError
from documents.management.commands import document_consumer
from documents.tests.utils import DirectoriesMixin


class ConsumerThread(Thread):

    def __init__(self):
        super().__init__()
        self.cmd = document_consumer.Command()

    def run(self) -> None:
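        # handle() blocks and watches the consumption directory until
        # stop_flag is set via stop() below.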
        self.cmd.handle(directory=settings.CONSUMPTION_DIR, oneshot=False)

    def stop(self):
        # Consumer checks this every second.
        self.cmd.stop_flag = True


def chunked(size, source):
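    # Yield successive slices of "source" that are "size" bytes long;
    # slow_write_file() below uses this to write the sample PDF gradually.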
    for i in range(0, len(source), size):
        yield source[i:i+size]


class ConsumerMixin:

    sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")

    def setUp(self) -> None:
        super(ConsumerMixin, self).setUp()
        self.t = None
        patcher = mock.patch("documents.management.commands.document_consumer.async_task")
        self.task_mock = patcher.start()
        self.addCleanup(patcher.stop)

    def t_start(self):
        self.t = ConsumerThread()
        self.t.start()
        # give the consumer some time to do initial work
        sleep(1)

    def tearDown(self) -> None:
        if self.t:
            # set the stop flag
            self.t.stop()
            # wait for the consumer to exit.
            self.t.join()

        super(ConsumerMixin, self).tearDown()

    def wait_for_task_mock_call(self):
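        # Poll for up to 10 seconds (100 iterations of 0.1 s) until the
        # consumer has handed a file to the mocked async_task.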
        n = 0
        while n < 100:
            if self.task_mock.call_count > 0:
                # give task_mock some time to finish and raise errors
                sleep(1)
                return
            n += 1
            sleep(0.1)

    # A bogus async_task that will simply check the file for
    # completeness and raise an exception otherwise.
    def bogus_task(self, func, filename, **kwargs):
        eq = filecmp.cmp(filename, self.sample_file, shallow=False)
        if not eq:
            print("Consumed an INVALID file.")
            raise ConsumerError("Incomplete File READ FAILED")
        else:
            print("Consumed a perfectly valid file.")

    def slow_write_file(self, target, incomplete=False):
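        # Write the sample PDF to "target" in 1 kB chunks with a short
        # delay between chunks, simulating a file that is still being
        # written while the consumer watches the directory.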
        with open(self.sample_file, 'rb') as f:
            pdf_bytes = f.read()

        if incomplete:
            pdf_bytes = pdf_bytes[:len(pdf_bytes) - 100]

        with open(target, 'wb') as f:
            # this will take 2 seconds, since the file is about 20k.
            print("Start writing file.")
            for b in chunked(1000, pdf_bytes):
                f.write(b)
                sleep(0.1)
            print("file completed.")


class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

    def test_consume_file(self):
        self.t_start()

        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        shutil.copy(self.sample_file, f)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], f)

    def test_consume_file_invalid_ext(self):
        self.t_start()

        f = os.path.join(self.dirs.consumption_dir, "my_file.wow")
        shutil.copy(self.sample_file, f)

        self.wait_for_task_mock_call()

        self.task_mock.assert_not_called()

    def test_consume_existing_file(self):
        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        shutil.copy(self.sample_file, f)

        self.t_start()
        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], f)

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_pdf(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")

        self.slow_write_file(fname)

        self.wait_for_task_mock_call()

        error_logger.assert_not_called()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname)

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_and_move(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.~df")
        fname2 = os.path.join(self.dirs.consumption_dir, "my_file.pdf")

        self.slow_write_file(fname)
        shutil.move(fname, fname2)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname2)

        error_logger.assert_not_called()

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_incomplete(self, error_logger):

        self.task_mock.side_effect = self.bogus_task

        self.t_start()

        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
        self.slow_write_file(fname, incomplete=True)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()
        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], fname)

        # assert that we have an error logged with this invalid file.
        error_logger.assert_called_once()

    @override_settings(CONSUMPTION_DIR="does_not_exist")
    def test_consumption_directory_invalid(self):

        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')

    @override_settings(CONSUMPTION_DIR="")
    def test_consumption_directory_unset(self):

        self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')


@override_settings(CONSUMER_POLLING=1)
class TestConsumerPolling(TestConsumer):
    # just do all the tests with polling
    pass


@override_settings(CONSUMER_RECURSIVE=True)
class TestConsumerRecursive(TestConsumer):
    # just do all the tests with recursive
    pass


@override_settings(CONSUMER_RECURSIVE=True)
@override_settings(CONSUMER_POLLING=1)
class TestConsumerRecursivePolling(TestConsumer):
    # just do all the tests with polling and recursive
    pass


class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

    @override_settings(CONSUMER_RECURSIVE=True)
    @override_settings(CONSUMER_SUBDIRS_AS_TAGS=True)
    def test_consume_file_with_path_tags(self):

        tag_names = ("existingTag", "Space Tag")
        # Create a Tag prior to consuming a file using it in path
        tag_ids = [Tag.objects.create(name=tag_names[0]).pk,]

        self.t_start()

        path = os.path.join(self.dirs.consumption_dir, *tag_names)
        os.makedirs(path, exist_ok=True)
        f = os.path.join(path, "my_file.pdf")
        # Wait at least inotify read_delay for recursive watchers
        # to be created for the new directories
        sleep(1)
        shutil.copy(self.sample_file, f)

        self.wait_for_task_mock_call()

        self.task_mock.assert_called_once()

        # Add the pk of the Tag created by _consume()
        tag_ids.append(Tag.objects.get(name=tag_names[1]).pk)

        args, kwargs = self.task_mock.call_args
        self.assertEqual(args[1], f)

        # assertCountEqual has a bad name, but test that the first
        # sequence contains the same elements as second, regardless of
        # their order.
        self.assertCountEqual(kwargs["override_tag_ids"], tag_ids)

    @override_settings(CONSUMER_POLLING=1)
    def test_consume_file_with_path_tags_polling(self):
        self.test_consume_file_with_path_tags()
src/documents/tests/test_management_decrypt.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import hashlib
import json
import os
import shutil
import tempfile
from unittest import mock

from django.core.management import call_command
from django.test import TestCase, override_settings

from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent


class TestDecryptDocuments(TestCase):

    @override_settings(
        ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
        THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
        PASSPHRASE="test",
        PAPERLESS_FILENAME_FORMAT=None
    )
    @mock.patch("documents.management.commands.decrypt_documents.input")
    def test_decrypt(self, m):

        media_dir = tempfile.mkdtemp()
        originals_dir = os.path.join(media_dir, "documents", "originals")
        thumb_dir = os.path.join(media_dir, "documents", "thumbnails")
        os.makedirs(originals_dir, exist_ok=True)
        os.makedirs(thumb_dir, exist_ok=True)

        override_settings(
            ORIGINALS_DIR=originals_dir,
            THUMBNAIL_DIR=thumb_dir,
            PASSPHRASE="test"
        ).enable()

        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000002.pdf.gpg"), os.path.join(originals_dir, "0000002.pdf.gpg"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000002.png.gpg"), os.path.join(thumb_dir, "0000002.png.gpg"))

        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)

        call_command('decrypt_documents')

        doc = Document.objects.get(id=2)

        self.assertEqual(doc.storage_type, Document.STORAGE_TYPE_UNENCRYPTED)
        self.assertEqual(doc.filename, "0000002.pdf")
        self.assertTrue(os.path.isfile(os.path.join(originals_dir, "0000002.pdf")))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(os.path.join(thumb_dir, "0000002.png")))
        self.assertTrue(os.path.isfile(doc.thumbnail_path))

        with doc.source_file as f:
            checksum = hashlib.md5(f.read()).hexdigest()
            self.assertEqual(checksum, doc.checksum)
src/documents/tests/test_management_exporter.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import hashlib
import json
import os
import shutil
import tempfile

from django.core.management import call_command
from django.test import TestCase, override_settings

from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent
from documents.sanity_checker import check_sanity
from documents.tests.utils import DirectoriesMixin, paperless_environment


class TestExportImport(DirectoriesMixin, TestCase):

    @override_settings(
        PASSPHRASE="test"
    )
    def test_exporter(self):
        shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
        shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

        file = os.path.join(self.dirs.originals_dir, "0000001.pdf")

        Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
        Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        Tag.objects.create(name="t")
        DocumentType.objects.create(name="dt")
        Correspondent.objects.create(name="c")

        target = tempfile.mkdtemp()

        call_command('document_exporter', target)

        with open(os.path.join(target, "manifest.json")) as f:
            manifest = json.load(f)

        self.assertEqual(len(manifest), 5)

        for element in manifest:
            if element['model'] == 'documents.document':
                fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME])
                self.assertTrue(os.path.exists(fname))
                self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))

                with open(fname, "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, element['fields']['checksum'])

                if document_exporter.EXPORTER_ARCHIVE_NAME in element:
                    fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
                    self.assertTrue(os.path.exists(fname))

                    with open(fname, "rb") as f:
                        checksum = hashlib.md5(f.read()).hexdigest()
                    self.assertEqual(checksum, element['fields']['archive_checksum'])

        with paperless_environment() as dirs:
            call_command('document_importer', target)
            messages = check_sanity()
            # everything is alright after the test
            self.assertEqual(len(messages), 0, str([str(m) for m in messages]))

    def test_export_missing_files(self):

        target = tempfile.mkdtemp()
        Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")
        self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)

    def test_duplicate_titles(self):
        # TODO
        pass
src/documents/tests/test_management_retagger.py (new file, 58 lines)
@@ -0,0 +1,58 @@
from django.core.management import call_command
from django.test import TestCase

from documents.models import Document, Tag, Correspondent, DocumentType
from documents.tests.utils import DirectoriesMixin


class TestRetagger(DirectoriesMixin, TestCase):

    def make_models(self):
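        # Three documents plus tag/correspondent/document type rules that
        # match on the words "first" and "second" in the document content.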
        self.d1 = Document.objects.create(checksum="A", title="A", content="first document")
        self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
        self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")

        self.tag_first = Tag.objects.create(name="tag1", match="first", matching_algorithm=Tag.MATCH_ANY)
        self.tag_second = Tag.objects.create(name="tag2", match="second", matching_algorithm=Tag.MATCH_ANY)

        self.correspondent_first = Correspondent.objects.create(
            name="c1", match="first", matching_algorithm=Correspondent.MATCH_ANY)
        self.correspondent_second = Correspondent.objects.create(
            name="c2", match="second", matching_algorithm=Correspondent.MATCH_ANY)

        self.doctype_first = DocumentType.objects.create(
            name="dt1", match="first", matching_algorithm=DocumentType.MATCH_ANY)
        self.doctype_second = DocumentType.objects.create(
            name="dt2", match="second", matching_algorithm=DocumentType.MATCH_ANY)

    def get_updated_docs(self):
        return Document.objects.get(title="A"), Document.objects.get(title="B"), Document.objects.get(title="C")

    def setUp(self) -> None:
        super(TestRetagger, self).setUp()
        self.make_models()

    def test_add_tags(self):
        call_command('document_retagger', '--tags')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.tags.count(), 1)
        self.assertEqual(d_second.tags.count(), 1)
        self.assertEqual(d_unrelated.tags.count(), 0)

        self.assertEqual(d_first.tags.first(), self.tag_first)
        self.assertEqual(d_second.tags.first(), self.tag_second)

    def test_add_type(self):
        call_command('document_retagger', '--document_type')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.document_type, self.doctype_first)
        self.assertEqual(d_second.document_type, self.doctype_second)

    def test_add_correspondent(self):
        call_command('document_retagger', '--correspondent')
        d_first, d_second, d_unrelated = self.get_updated_docs()

        self.assertEqual(d_first.correspondent, self.correspondent_first)
        self.assertEqual(d_second.correspondent, self.correspondent_second)
@@ -1,3 +1,5 @@
import shutil
import tempfile
from random import randint

from django.contrib.admin.models import LogEntry
@@ -215,6 +217,13 @@ class TestDocumentConsumptionFinishedSignal(TestCase):
        self.doc_contains = Document.objects.create(
            content="I contain the keyword.", mime_type="application/pdf")

        self.index_dir = tempfile.mkdtemp()
        # TODO: we should not need the index here.
        override_settings(INDEX_DIR=self.index_dir).enable()

    def tearDown(self) -> None:
        shutil.rmtree(self.index_dir, ignore_errors=True)

    def test_tag_applied_any(self):
        t1 = Tag.objects.create(
            name="test", match="keyword", matching_algorithm=Tag.MATCH_ANY)
@@ -1,10 +1,15 @@
import os
import shutil
import tempfile
from tempfile import TemporaryDirectory
from unittest import mock

from django.test import TestCase
from django.test import TestCase, override_settings

from documents.parsers import get_parser_class
from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \
    get_parser_class_for_mime_type, DocumentParser, is_file_ext_supported
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser


def fake_magic_from_file(file, mime=False):
@@ -27,7 +32,7 @@ class TestParserDiscovery(TestCase):
            pass

        m.return_value = (
            (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
            (None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}),
        )

        self.assertEqual(
@@ -45,8 +50,8 @@ class TestParserDiscovery(TestCase):
            pass

        m.return_value = (
            (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
            (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
            (None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}),
            (None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}),
        )

        self.assertEqual(
@@ -61,3 +66,57 @@ class TestParserDiscovery(TestCase):
        self.assertIsNone(
            get_parser_class("doc.pdf")
        )


def fake_get_thumbnail(self, path, mimetype):
    return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")


class TestBaseParser(TestCase):

    def setUp(self) -> None:

        self.scratch = tempfile.mkdtemp()
        override_settings(
            SCRATCH_DIR=self.scratch
        ).enable()

    def tearDown(self) -> None:
        shutil.rmtree(self.scratch)

    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
    @override_settings(OPTIMIZE_THUMBNAILS=True)
    def test_get_optimised_thumbnail(self):
        parser = DocumentParser(None)

        parser.get_optimised_thumbnail("any", "not important")

    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
    @override_settings(OPTIMIZE_THUMBNAILS=False)
    def test_get_optimised_thumb_disabled(self):
        parser = DocumentParser(None)

        path = parser.get_optimised_thumbnail("any", "not important")
        self.assertEqual(path, fake_get_thumbnail(None, None, None))


class TestParserAvailability(TestCase):

    def test_file_extensions(self):

        for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
            self.assertIn(ext, get_supported_file_extensions())
        self.assertEqual(get_default_file_extension('application/pdf'), ".pdf")
        self.assertEqual(get_default_file_extension('image/png'), ".png")
        self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg")
        self.assertEqual(get_default_file_extension('text/plain'), ".txt")
        self.assertEqual(get_default_file_extension('text/csv'), ".csv")
        self.assertEqual(get_default_file_extension('application/zip'), ".zip")
        self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "")

        self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser)
        self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser)
        self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)

        self.assertTrue(is_file_ext_supported('.pdf'))
        self.assertFalse(is_file_ext_supported('.hsdfh'))
src/documents/tests/test_post_consume_handlers.py (new file, 56 lines)
@@ -0,0 +1,56 @@
from unittest import mock

from django.test import TestCase, override_settings

from documents.models import Document, Tag, Correspondent
from documents.signals.handlers import run_post_consume_script


class PostConsumeTestCase(TestCase):

    @mock.patch("documents.signals.handlers.Popen")
    @override_settings(POST_CONSUME_SCRIPT=None)
    def test_no_post_consume_script(self, m):
        doc = Document.objects.create(title="Test", mime_type="application/pdf")
        tag1 = Tag.objects.create(name="a")
        tag2 = Tag.objects.create(name="b")
        doc.tags.add(tag1)
        doc.tags.add(tag2)

        run_post_consume_script(None, doc)

        m.assert_not_called()

    @mock.patch("documents.signals.handlers.Popen")
    @override_settings(POST_CONSUME_SCRIPT="script")
    def test_post_consume_script_simple(self, m):
        doc = Document.objects.create(title="Test", mime_type="application/pdf")

        run_post_consume_script(None, doc)

        m.assert_called_once()

    @mock.patch("documents.signals.handlers.Popen")
    @override_settings(POST_CONSUME_SCRIPT="script")
    def test_post_consume_script_with_correspondent(self, m):
        c = Correspondent.objects.create(name="my_bank")
        doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
        tag1 = Tag.objects.create(name="a")
        tag2 = Tag.objects.create(name="b")
        doc.tags.add(tag1)
        doc.tags.add(tag2)

        run_post_consume_script(None, doc)

        m.assert_called_once()

        args, kwargs = m.call_args

        command = args[0]

        self.assertEqual(command[0], "script")
        self.assertEqual(command[1], str(doc.pk))
        self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
        self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
        self.assertEqual(command[7], "my_bank")
        self.assertCountEqual(command[8].split(","), ["a", "b"])
src/documents/tests/test_sanity_check.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import os
import shutil
from pathlib import Path

from django.test import TestCase

from documents.models import Document
from documents.sanity_checker import check_sanity, SanityFailedError
from documents.tests.utils import DirectoriesMixin


class TestSanityCheck(DirectoriesMixin, TestCase):

    def make_test_data(self):
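        # Copy a known-good original/archive/thumbnail triple into place
        # and create a matching Document record with correct checksums.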

        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf"), os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf"))
        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png"))

        return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf")

    def test_no_docs(self):
        self.assertEqual(len(check_sanity()), 0)

    def test_success(self):
        self.make_test_data()
        self.assertEqual(len(check_sanity()), 0)

    def test_no_thumbnail(self):
        doc = self.make_test_data()
        os.remove(doc.thumbnail_path)
        self.assertEqual(len(check_sanity()), 1)

    def test_thumbnail_no_access(self):
        doc = self.make_test_data()
        os.chmod(doc.thumbnail_path, 0o000)
        self.assertEqual(len(check_sanity()), 1)
        os.chmod(doc.thumbnail_path, 0o777)

    def test_no_original(self):
        doc = self.make_test_data()
        os.remove(doc.source_path)
        self.assertEqual(len(check_sanity()), 1)

    def test_original_no_access(self):
        doc = self.make_test_data()
        os.chmod(doc.source_path, 0o000)
        self.assertEqual(len(check_sanity()), 1)
        os.chmod(doc.source_path, 0o777)

    def test_original_checksum_mismatch(self):
        doc = self.make_test_data()
        doc.checksum = "WOW"
        doc.save()
        self.assertEqual(len(check_sanity()), 1)

    def test_no_archive(self):
        doc = self.make_test_data()
        os.remove(doc.archive_path)
        self.assertEqual(len(check_sanity()), 1)

    def test_archive_no_access(self):
        doc = self.make_test_data()
        os.chmod(doc.archive_path, 0o000)
        self.assertEqual(len(check_sanity()), 1)
        os.chmod(doc.archive_path, 0o777)

    def test_archive_checksum_mismatch(self):
        doc = self.make_test_data()
        doc.archive_checksum = "WOW"
        doc.save()
        self.assertEqual(len(check_sanity()), 1)

    def test_empty_content(self):
        doc = self.make_test_data()
        doc.content = ""
        doc.save()
        self.assertEqual(len(check_sanity()), 1)

    def test_orphaned_file(self):
        doc = self.make_test_data()
        Path(self.dirs.originals_dir, "orphaned").touch()
        self.assertEqual(len(check_sanity()), 1)

    def test_all(self):
        Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf")
        string = str(SanityFailedError(check_sanity()))
src/documents/tests/test_tasks.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from datetime import datetime

from django.test import TestCase
from django.utils import timezone

from documents import tasks
from documents.models import Document
from documents.tests.utils import DirectoriesMixin


class TestTasks(DirectoriesMixin, TestCase):

    def test_index_reindex(self):
        Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now())

        tasks.index_reindex()

    def test_index_optimize(self):
        Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now())

        tasks.index_optimize()

    def test_train_classifier(self):
        tasks.train_classifier()
src/documents/tests/utils.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import os
import shutil
import tempfile
from collections import namedtuple
from contextlib import contextmanager

from django.test import override_settings


def setup_directories():
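    # Create a fresh set of temporary paperless directories and enable
    # matching settings overrides; remove_dirs() below undoes both.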

    dirs = namedtuple("Dirs", ())

    dirs.data_dir = tempfile.mkdtemp()
    dirs.scratch_dir = tempfile.mkdtemp()
    dirs.media_dir = tempfile.mkdtemp()
    dirs.consumption_dir = tempfile.mkdtemp()
    dirs.index_dir = os.path.join(dirs.data_dir, "index")
    dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
    dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")
    dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive")

    os.makedirs(dirs.index_dir, exist_ok=True)
    os.makedirs(dirs.originals_dir, exist_ok=True)
    os.makedirs(dirs.thumbnail_dir, exist_ok=True)
    os.makedirs(dirs.archive_dir, exist_ok=True)

    dirs.settings_override = override_settings(
        DATA_DIR=dirs.data_dir,
        SCRATCH_DIR=dirs.scratch_dir,
        MEDIA_ROOT=dirs.media_dir,
        ORIGINALS_DIR=dirs.originals_dir,
        THUMBNAIL_DIR=dirs.thumbnail_dir,
        ARCHIVE_DIR=dirs.archive_dir,
        CONSUMPTION_DIR=dirs.consumption_dir,
        INDEX_DIR=dirs.index_dir,
        MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")
    )
    dirs.settings_override.enable()

    return dirs


def remove_dirs(dirs):
    shutil.rmtree(dirs.media_dir, ignore_errors=True)
    shutil.rmtree(dirs.data_dir, ignore_errors=True)
    shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
    shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
    dirs.settings_override.disable()


@contextmanager
def paperless_environment():
    dirs = None
    try:
        dirs = setup_directories()
        yield dirs
    finally:
        if dirs:
            remove_dirs(dirs)


class DirectoriesMixin:

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dirs = None

    def setUp(self) -> None:
        self.dirs = setup_directories()
        super(DirectoriesMixin, self).setUp()

    def tearDown(self) -> None:
        super(DirectoriesMixin, self).tearDown()
        remove_dirs(self.dirs)
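
A minimal usage sketch (illustration only, not part of the patch; the test class name is hypothetical): mixing DirectoriesMixin into a TestCase runs it against the throwaway directories created above.

    import os
    from django.test import TestCase
    from documents.tests.utils import DirectoriesMixin

    class MyStorageTest(DirectoriesMixin, TestCase):  # hypothetical name
        def test_dirs_exist(self):
            # self.dirs points at fresh temp directories that setUp()
            # created and that tearDown() removes again.
            self.assertTrue(os.path.isdir(self.dirs.originals_dir))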
@@ -1,8 +1,16 @@
import os
import tempfile
from datetime import datetime
from time import mktime

from django.conf import settings
from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest, Http404
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from rest_framework import parsers
from rest_framework.decorators import action
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.mixins import (
@@ -30,14 +38,14 @@ from .filters import (
    DocumentTypeFilterSet,
    LogFilterSet
)
from .forms import UploadForm
from .models import Correspondent, Document, Log, Tag, DocumentType
from .serialisers import (
    CorrespondentSerializer,
    DocumentSerializer,
    LogSerializer,
    TagSerializer,
    DocumentTypeSerializer
    DocumentTypeSerializer,
    PostDocumentSerializer
)


@@ -126,36 +134,54 @@ class DocumentViewSet(RetrieveModelMixin,
        index.remove_document_from_index(self.get_object())
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

    def file_response(self, pk, disposition):
    @staticmethod
    def original_requested(request):
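        # True when the client explicitly asked for the original file via
        # the "original=true" query parameter.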
        return (
            'original' in request.query_params and
            request.query_params['original'] == 'true'
        )

    def file_response(self, pk, request, disposition):
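        # Serve the archive PDF when one exists and the client did not ask
        # for the original; decrypt GPG-stored files before responding.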
|
||||
doc = Document.objects.get(id=pk)
|
||||
|
||||
if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
|
||||
file_handle = doc.source_file
|
||||
if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501
|
||||
file_handle = doc.archive_file
|
||||
filename = doc.get_public_filename(archive=True)
|
||||
mime_type = 'application/pdf'
|
||||
else:
|
||||
file_handle = GnuPG.decrypted(doc.source_file)
|
||||
file_handle = doc.source_file
|
||||
filename = doc.get_public_filename()
|
||||
mime_type = doc.mime_type
|
||||
|
||||
response = HttpResponse(file_handle, content_type=doc.mime_type)
|
||||
if doc.storage_type == Document.STORAGE_TYPE_GPG:
|
||||
file_handle = GnuPG.decrypted(file_handle)
|
||||
|
||||
response = HttpResponse(file_handle, content_type=mime_type)
|
||||
response["Content-Disposition"] = '{}; filename="{}"'.format(
|
||||
disposition, doc.file_name)
|
||||
disposition, filename)
|
||||
return response
|
||||
|
||||
@action(methods=['post'], detail=False)
|
||||
def post_document(self, request, pk=None):
|
||||
# TODO: is this a good implementation?
|
||||
form = UploadForm(data=request.POST, files=request.FILES)
|
||||
if form.is_valid():
|
||||
form.save()
|
||||
return Response("OK")
|
||||
else:
|
||||
return HttpResponseBadRequest(str(form.errors))
|
||||
@action(methods=['get'], detail=True)
|
||||
def metadata(self, request, pk=None):
|
||||
try:
|
||||
doc = Document.objects.get(pk=pk)
|
||||
return Response({
|
||||
"paperless__checksum": doc.checksum,
|
||||
"paperless__mime_type": doc.mime_type,
|
||||
"paperless__filename": doc.filename,
|
||||
"paperless__has_archive_version":
|
||||
os.path.isfile(doc.archive_path)
|
||||
})
|
||||
except Document.DoesNotExist:
|
||||
raise Http404()
|
||||
|
||||
@action(methods=['get'], detail=True)
|
||||
def preview(self, request, pk=None):
|
||||
try:
|
||||
response = self.file_response(pk, "inline")
|
||||
response = self.file_response(
|
||||
pk, request, "inline")
|
||||
return response
|
||||
except FileNotFoundError:
|
||||
raise Http404("Document source file does not exist")
|
||||
except (FileNotFoundError, Document.DoesNotExist):
|
||||
raise Http404()
|
||||
|
||||
@action(methods=['get'], detail=True)
|
||||
@cache_control(public=False, max_age=315360000)
|
||||
@@ -163,15 +189,16 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
try:
|
||||
return HttpResponse(Document.objects.get(id=pk).thumbnail_file,
|
||||
content_type='image/png')
|
||||
except FileNotFoundError:
|
||||
raise Http404("Document thumbnail does not exist")
|
||||
except (FileNotFoundError, Document.DoesNotExist):
|
||||
raise Http404()
|
||||
|
||||
@action(methods=['get'], detail=True)
|
||||
def download(self, request, pk=None):
|
||||
try:
|
||||
return self.file_response(pk, "attachment")
|
||||
except FileNotFoundError:
|
||||
raise Http404("Document source file does not exist")
|
||||
return self.file_response(
|
||||
pk, request, "attachment")
|
||||
except (FileNotFoundError, Document.DoesNotExist):
|
||||
raise Http404()
|
||||
|
||||
|
||||
class LogViewSet(ReadOnlyModelViewSet):
|
||||
@@ -186,11 +213,62 @@ class LogViewSet(ReadOnlyModelViewSet):
|
||||
ordering_fields = ("created",)
|
||||
|
||||
|
||||
class PostDocumentView(APIView):

    permission_classes = (IsAuthenticated,)
    serializer_class = PostDocumentSerializer
    parser_classes = (parsers.MultiPartParser,)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    def post(self, request, *args, **kwargs):

        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        doc_name, doc_data = serializer.validated_data.get('document')
        correspondent_id = serializer.validated_data.get('correspondent')
        document_type_id = serializer.validated_data.get('document_type')
        tag_ids = serializer.validated_data.get('tags')
        title = serializer.validated_data.get('title')

        t = int(mktime(datetime.now().timetuple()))

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                         dir=settings.SCRATCH_DIR,
                                         delete=False) as f:
            f.write(doc_data)
            os.utime(f.name, times=(t, t))

        async_task("documents.tasks.consume_file",
                   f.name,
                   override_filename=doc_name,
                   override_title=title,
                   override_correspondent_id=correspondent_id,
                   override_document_type_id=document_type_id,
                   override_tag_ids=tag_ids,
                   task_name=os.path.basename(doc_name)[:100])
        return Response("OK")

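A hedged usage sketch for the new upload view above (not part of the diff). The host, URL prefix, and credentials are assumptions; the form fields mirror the validated_data keys read in post():

    import requests

    # 'document' is the multipart file field; the remaining fields are
    # optional overrides that consume_file picks up via async_task.
    with open("invoice.pdf", "rb") as f:
        response = requests.post(
            "http://localhost:8000/api/documents/post_document/",  # assumed URL
            data={"title": "Invoice"},
            files={"document": f},
            auth=("user", "pass"),  # assumed credentials
        )
    print(response.text)  # "OK" on success; validation errors yield a 400
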
class SearchView(APIView):

    permission_classes = (IsAuthenticated,)

    ix = index.open_index()
    def __init__(self, *args, **kwargs):
        super(SearchView, self).__init__(*args, **kwargs)
        self.ix = index.open_index()

    def add_infos_to_hit(self, r):
        doc = Document.objects.get(id=r['id'])
@@ -203,33 +281,42 @@ class SearchView(APIView):
        }

    def get(self, request, format=None):
        if 'query' in request.query_params:
            query = request.query_params['query']
            try:
                page = int(request.query_params.get('page', 1))
            except (ValueError, TypeError):
                page = 1

            with index.query_page(self.ix, query, page) as result_page:
                return Response(
                    {'count': len(result_page),
                     'page': result_page.pagenum,
                     'page_count': result_page.pagecount,
                     'results': list(map(self.add_infos_to_hit, result_page))})

        else:
        if 'query' not in request.query_params:
            return Response({
                'count': 0,
                'page': 0,
                'page_count': 0,
                'results': []})

        query = request.query_params['query']
        try:
            page = int(request.query_params.get('page', 1))
        except (ValueError, TypeError):
            page = 1

        if page < 1:
            page = 1

        try:
            with index.query_page(self.ix, query, page) as (result_page,
                                                            corrected_query):
                return Response(
                    {'count': len(result_page),
                     'page': result_page.pagenum,
                     'page_count': result_page.pagecount,
                     'corrected_query': corrected_query,
                     'results': list(map(self.add_infos_to_hit, result_page))})
        except Exception as e:
            return HttpResponseBadRequest(str(e))


class SearchAutoCompleteView(APIView):

    permission_classes = (IsAuthenticated,)

    ix = index.open_index()
    def __init__(self, *args, **kwargs):
        super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
        self.ix = index.open_index()

    def get(self, request, format=None):
        if 'term' in request.query_params:
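A hedged sketch of querying the reworked SearchView above (the route and URL prefix are assumptions; the response keys, including the new 'corrected_query', come from the code):

    import requests

    r = requests.get(
        "http://localhost:8000/api/search/",  # assumed URL
        params={"query": "invoice", "page": 1},
        auth=("user", "pass"),  # assumed credentials
    )
    # r.json() -> {"count": ..., "page": 1, "page_count": ...,
    #              "corrected_query": ..., "results": [...]}
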
@@ -1,8 +1,19 @@
from django.conf import settings
from django.contrib.auth.models import User
from django.utils.deprecation import MiddlewareMixin
from rest_framework import authentication


class AutoLoginMiddleware(MiddlewareMixin):

    def process_request(self, request):
        try:
            request.user = User.objects.get(
                username=settings.AUTO_LOGIN_USERNAME)
        except User.DoesNotExist:
            pass


class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
    """ This class is here to provide authentication to the angular dev server
    during development. This is disabled in production.
@@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs):
    binaries = (
        settings.CONVERT_BINARY,
        settings.OPTIPNG_BINARY,
        settings.UNPAPER_BINARY,
        "tesseract"
    )
@@ -17,16 +17,3 @@ class GnuPG:
            passphrase = settings.PASSPHRASE

        return cls.gpg.decrypt_file(file_handle, passphrase=passphrase).data

    @classmethod
    def encrypted(cls, file_handle, passphrase=None):

        if not passphrase:
            passphrase = settings.PASSPHRASE

        return cls.gpg.encrypt_file(
            file_handle,
            recipients=None,
            passphrase=passphrase,
            symmetric=True
        ).data
@@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta
MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media"))
ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals")
ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive")
THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")

DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data"))
@@ -85,6 +86,7 @@ INSTALLED_APPS = [
    "django.contrib.admin",

    "rest_framework",
    "rest_framework.authtoken",
    "django_filters",

    "django_q",
@@ -96,7 +98,8 @@ INSTALLED_APPS = [
REST_FRAMEWORK = {
    'DEFAULT_AUTHENTICATION_CLASSES': [
        'rest_framework.authentication.BasicAuthentication',
        'rest_framework.authentication.SessionAuthentication'
        'rest_framework.authentication.SessionAuthentication',
        'rest_framework.authentication.TokenAuthentication'
    ]
}

@@ -156,6 +159,15 @@ CHANNEL_LAYERS = {
# Security #
###############################################################################

AUTO_LOGIN_USERNAME = os.getenv("PAPERLESS_AUTO_LOGIN_USERNAME")

if AUTO_LOGIN_USERNAME:
    _index = MIDDLEWARE.index('django.contrib.auth.middleware.AuthenticationMiddleware')
    # This overrides everything the auth middleware is doing but still allows
    # regular login in case the provided user does not exist.
    MIDDLEWARE.insert(_index+1, 'paperless.auth.AutoLoginMiddleware')


if DEBUG:
    X_FRAME_OPTIONS = ''
    # this should really be 'allow-from uri' but it's not supported in any major
@@ -253,29 +265,48 @@ USE_TZ = True
# Logging #
###############################################################################

DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER")

LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    'formatters': {
        'verbose': {
            'format': '{levelname} {asctime} {module} {message}',
            'style': '{',
        },
        'simple': {
            'format': '{levelname} {message}',
            'style': '{',
        },
    },
    "handlers": {
        "dbhandler": {
        "db": {
            "level": "DEBUG",
            "class": "documents.loggers.PaperlessHandler",
        },
        "streamhandler": {
            "class": "logging.StreamHandler"
        "console": {
            "level": "INFO",
            "class": "logging.StreamHandler",
            "formatter": "verbose",
        }
    },
    "root": {
        "handlers": ["console"],
        "level": "DEBUG",
    },
    "loggers": {
        "documents": {
            "handlers": ["dbhandler", "streamhandler"],
            "level": "DEBUG"
            "handlers": ["db"],
            "propagate": True,
        },
        "paperless_mail": {
            "handlers": ["dbhandler", "streamhandler"],
            "level": "DEBUG"
            "handlers": ["db"],
            "propagate": True,
        },
        "paperless_tesseract": {
            "handlers": ["dbhandler", "streamhandler"],
            "level": "DEBUG"
            "handlers": ["db"],
            "propagate": True,
        },
    },
}
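With the TokenAuthentication class added to REST_FRAMEWORK above and the token/ route registered further down in this diff (via rest_framework's obtain_auth_token), clients can trade credentials for a token. A minimal sketch; host, URL prefix, and credentials are assumptions:

    import requests

    # obtain_auth_token returns {"token": "..."} for valid credentials
    token = requests.post(
        "http://localhost:8000/api/token/",  # assumed URL
        data={"username": "user", "password": "pass"},
    ).json()["token"]

    requests.get(
        "http://localhost:8000/api/documents/",  # assumed endpoint
        headers={"Authorization": f"Token {token}"},
    )
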
@@ -332,6 +363,10 @@ CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))

CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")

CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")

CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")

OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")

OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
@@ -340,9 +375,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

# OCRmyPDF --output-type options are available.
# TODO: validate this setting.
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")

# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
# skip, skip_noarchive, redo, force
# TODO: validate this.
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")

OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")

OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")

# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")
@@ -351,11 +394,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))

GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")

OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")


# Pre-2.x versions of Paperless stored your documents locally with GPG
@@ -4,6 +4,7 @@ from django.contrib.auth.decorators import login_required
from django.urls import path, re_path
from django.views.decorators.csrf import csrf_exempt
from django.views.generic import RedirectView
from rest_framework.authtoken import views
from rest_framework.routers import DefaultRouter

from paperless.consumers import StatusConsumer
@@ -16,7 +17,8 @@ from documents.views import (
    SearchView,
    IndexView,
    SearchAutoCompleteView,
    StatisticsView
    StatisticsView,
    PostDocumentView
)
from paperless.views import FaviconView

@@ -46,6 +48,11 @@ urlpatterns = [
        StatisticsView.as_view(),
        name="statistics"),

    re_path(r"^documents/post_document/", PostDocumentView.as_view(),
            name="post_document"),

    path('token/', views.obtain_auth_token)

] + api_router.urls)),

re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
@@ -1 +1 @@
__version__ = (0, 9, 1)
__version__ = (0, 9, 5)
@@ -4,6 +4,7 @@ from datetime import timedelta, date

import magic
from django.conf import settings
from django.db import DatabaseError
from django.utils.text import slugify
from django_q.tasks import async_task
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
@@ -86,46 +87,6 @@ def make_criterias(rule):
    return {**criterias, **get_rule_action(rule).get_criteria()}


def get_title(message, att, rule):
    if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
        title = message.subject
    elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
        title = os.path.splitext(os.path.basename(att.filename))[0]
    else:
        raise ValueError("Unknown title selector.")

    return title


def get_correspondent(message, rule):
    if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING:
        correspondent = None
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL:
        correspondent_name = message.from_
        correspondent = Correspondent.objects.get_or_create(
            name=correspondent_name, defaults={
                "slug": slugify(correspondent_name)
            })[0]
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME:
        if message.from_values and \
                'name' in message.from_values \
                and message.from_values['name']:
            correspondent_name = message.from_values['name']
        else:
            correspondent_name = message.from_

        correspondent = Correspondent.objects.get_or_create(
            name=correspondent_name, defaults={
                "slug": slugify(correspondent_name)
            })[0]
    elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
        correspondent = rule.assign_correspondent
    else:
        raise ValueError("Unknown correspondent selector")

    return correspondent


def get_mailbox(server, port, security):
    if security == MailAccount.IMAP_SECURITY_NONE:
        mailbox = MailBoxUnencrypted(server, port)
@@ -140,6 +101,51 @@ def get_mailbox(server, port, security):

class MailAccountHandler(LoggingMixin):

    def _correspondent_from_name(self, name):
        try:
            return Correspondent.objects.get_or_create(
                name=name, defaults={
                    "slug": slugify(name)
                })[0]
        except DatabaseError as e:
            self.log(
                "error",
                f"Error while retrieving correspondent {name}: {e}"
            )
            return None

    def get_title(self, message, att, rule):
        if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
            return message.subject

        elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
            return os.path.splitext(os.path.basename(att.filename))[0]

        else:
            raise ValueError("Unknown title selector.")

    def get_correspondent(self, message, rule):
        c_from = rule.assign_correspondent_from

        if c_from == MailRule.CORRESPONDENT_FROM_NOTHING:
            return None

        elif c_from == MailRule.CORRESPONDENT_FROM_EMAIL:
            return self._correspondent_from_name(message.from_)

        elif c_from == MailRule.CORRESPONDENT_FROM_NAME:
            if message.from_values and 'name' in message.from_values and message.from_values['name']:  # NOQA: E501
                return self._correspondent_from_name(
                    message.from_values['name'])
            else:
                return self._correspondent_from_name(message.from_)

        elif c_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
            return rule.assign_correspondent

        else:
            raise ValueError("Unknown correspondent selector")

    def handle_mail_account(self, account):

        self.renew_logging_group()
@@ -156,79 +162,89 @@ class MailAccountHandler(LoggingMixin):
            M.login(account.username, account.password)
        except Exception:
            raise MailError(
                f"Error while authenticating account {account.name}")
                f"Error while authenticating account {account}")

        self.log('debug', f"Account {account}: Processing "
                          f"{account.rules.count()} rule(s)")

        for rule in account.rules.order_by('order'):
            self.log(
                'debug',
                f"Account {account}: Processing rule {rule.name}")

            self.log(
                'debug',
                f"Rule {account}.{rule}: Selecting folder {rule.folder}")

            try:
                M.folder.set(rule.folder)
            except MailboxFolderSelectError:
                raise MailError(
                    f"Rule {rule.name}: Folder {rule.folder} "
                    f"does not exist in account {account.name}")
                total_processed_files += self.handle_mail_rule(M, rule)
            except Exception as e:
                self.log(
                    "error",
                    f"Rule {rule}: Error while processing rule: {e}",
                    exc_info=True
                )

            criterias = make_criterias(rule)
        return total_processed_files

    def handle_mail_rule(self, M, rule):

        self.log(
            'debug',
            f"Rule {rule}: Selecting folder {rule.folder}")

        try:
            M.folder.set(rule.folder)
        except MailboxFolderSelectError:
            raise MailError(
                f"Rule {rule}: Folder {rule.folder} "
                f"does not exist in account {rule.account}")

        criterias = make_criterias(rule)

        self.log(
            'debug',
            f"Rule {rule}: Searching folder with criteria "
            f"{str(AND(**criterias))}")

        try:
            messages = M.fetch(criteria=AND(**criterias),
                               mark_seen=False)
        except Exception:
            raise MailError(
                f"Rule {rule}: Error while fetching folder {rule.folder}")

        post_consume_messages = []

        mails_processed = 0
        total_processed_files = 0

        for message in messages:
            try:
                processed_files = self.handle_message(message, rule)
                if processed_files > 0:
                    post_consume_messages.append(message.uid)

                total_processed_files += processed_files
                mails_processed += 1
            except Exception as e:
                self.log(
                    'debug',
                    f"Rule {account}.{rule}: Searching folder with criteria "
                    f"{str(AND(**criterias))}")
                    "error",
                    f"Rule {rule}: Error while processing mail "
                    f"{message.uid}: {e}",
                    exc_info=True)

        try:
            messages = M.fetch(criteria=AND(**criterias),
                               mark_seen=False)
        except Exception:
            raise MailError(
                f"Rule {rule.name}: Error while fetching folder "
                f"{rule.folder} of account {account.name}")
        self.log(
            'debug',
            f"Rule {rule}: Processed {mails_processed} matching mail(s)")

        post_consume_messages = []
        self.log(
            'debug',
            f"Rule {rule}: Running mail actions on "
            f"{len(post_consume_messages)} mails")

        mails_processed = 0
        try:
            get_rule_action(rule).post_consume(
                M,
                post_consume_messages,
                rule.action_parameter)

        for message in messages:
            try:
                processed_files = self.handle_message(message, rule)
            except Exception:
                raise MailError(
                    f"Rule {rule.name}: Error while processing mail "
                    f"{message.uid} of account {account.name}")
                if processed_files > 0:
                    post_consume_messages.append(message.uid)

                total_processed_files += processed_files
                mails_processed += 1

        self.log(
            'debug',
            f"Rule {account}.{rule}: Processed {mails_processed} "
            f"matching mail(s)")

        self.log(
            'debug',
            f"Rule {account}.{rule}: Running mail actions on "
            f"{len(post_consume_messages)} mails")

        try:
            get_rule_action(rule).post_consume(
                M,
                post_consume_messages,
                rule.action_parameter)

        except Exception:
            raise MailError(
                f"Rule {rule.name}: Error while processing "
                f"post-consume actions for account {account.name}")
        except Exception as e:
            raise MailError(
                f"Rule {rule}: Error while processing post-consume actions: "
                f"{e}")

        return total_processed_files

@@ -238,11 +254,11 @@ class MailAccountHandler(LoggingMixin):

        self.log(
            'debug',
            f"Rule {rule.account}.{rule}: "
            f"Rule {rule}: "
            f"Processing mail {message.subject} from {message.from_} with "
            f"{len(message.attachments)} attachment(s)")

        correspondent = get_correspondent(message, rule)
        correspondent = self.get_correspondent(message, rule)
        tag = rule.assign_tag
        doc_type = rule.assign_document_type

@@ -253,12 +269,12 @@ class MailAccountHandler(LoggingMixin):
            if not att.content_disposition == "attachment":
                self.log(
                    'debug',
                    f"Rule {rule.account}.{rule}: "
                    f"Rule {rule}: "
                    f"Skipping attachment {att.filename} "
                    f"with content disposition inline")
                    f"with content disposition {att.content_disposition}")
                continue

            title = get_title(message, att, rule)
            title = self.get_title(message, att, rule)

            # don't trust the content type of the attachment. Could be
            # generic application/octet-stream.
@@ -274,7 +290,7 @@ class MailAccountHandler(LoggingMixin):

            self.log(
                'info',
                f"Rule {rule.account}.{rule}: "
                f"Rule {rule}: "
                f"Consuming attachment {att.filename} from mail "
                f"{message.subject} from {message.from_}")

@@ -293,7 +309,7 @@ class MailAccountHandler(LoggingMixin):
            else:
                self.log(
                    'debug',
                    f"Rule {rule.account}.{rule}: "
                    f"Rule {rule}: "
                    f"Skipping attachment {att.filename} "
                    f"since guessed mime type {mime_type} is not supported "
                    f"by paperless")

@@ -139,4 +139,4 @@ class MailRule(models.Model):
    )

    def __str__(self):
        return self.name
        return f"{self.account.name}.{self.name}"
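A quick illustration of the new __str__ (names below are hypothetical and the objects are not persisted): the f"Rule {rule}" log prefixes used throughout the mail handler now include the owning account.

    rule = MailRule(name="inbox rule", account=MailAccount(name="gmail"))
    str(rule)  # -> "gmail.inbox rule"
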
@@ -1,14 +1,20 @@
import logging

from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailAccountHandler, MailError
from paperless_mail.models import MailAccount


def process_mail_accounts():
    total_new_documents = 0
    for account in MailAccount.objects.all():
        total_new_documents += MailAccountHandler().handle_mail_account(
            account)
        try:
            total_new_documents += MailAccountHandler().handle_mail_account(
                account)
        except MailError as e:
            logging.getLogger(__name__).error(
                f"Error while processing mail account {account}: {e}",
                exc_info=True
            )

    if total_new_documents > 0:
        return f"Added {total_new_documents} document(s)."
@@ -17,8 +23,8 @@ def process_mail_accounts():


def process_mail_account(name):
    account = MailAccount.objects.find(name=name)
    if account:
    try:
        account = MailAccount.objects.get(name=name)
        MailAccountHandler().handle_mail_account(account)
    else:
        logging.error("Unknown mail account: {}".format(name))
    except MailAccount.DoesNotExist:
        logging.getLogger(__name__).error(f"Unknown mail account: {name}")
@@ -3,11 +3,14 @@ from collections import namedtuple
from typing import ContextManager
from unittest import mock

from django.core.management import call_command
from django.db import DatabaseError
from django.test import TestCase
from imap_tools import MailMessageFlags, MailboxFolderSelectError

from documents.models import Correspondent
from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title
from paperless_mail import tasks
from paperless_mail.mail import MailError, MailAccountHandler
from paperless_mail.models import MailRule, MailAccount


@@ -163,28 +166,30 @@ class TestMail(TestCase):
        me_localhost = Correspondent.objects.create(name=message2.from_)
        someone_else = Correspondent.objects.create(name="someone else")

        handler = MailAccountHandler()

        rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
        self.assertIsNone(get_correspondent(message, rule))
        self.assertIsNone(handler.get_correspondent(message, rule))

        rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
        c = get_correspondent(message, rule)
        c = handler.get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "someone@somewhere.com")
        c = get_correspondent(message2, rule)
        c = handler.get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "me@localhost.com")
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(name="c", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
        c = get_correspondent(message, rule)
        c = handler.get_correspondent(message, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.name, "Someone!")
        c = get_correspondent(message2, rule)
        c = handler.get_correspondent(message2, rule)
        self.assertIsNotNone(c)
        self.assertEqual(c.id, me_localhost.id)

        rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
        c = get_correspondent(message, rule)
        c = handler.get_correspondent(message, rule)
        self.assertEqual(c, someone_else)

    def test_get_title(self):
@@ -192,10 +197,13 @@ class TestMail(TestCase):
        message.subject = "the message title"
        att = namedtuple('Attachment', [])
        att.filename = "this_is_the_file.pdf"

        handler = MailAccountHandler()

        rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME)
        self.assertEqual(get_title(message, att, rule), "this_is_the_file")
        self.assertEqual(handler.get_title(message, att, rule), "this_is_the_file")
        rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT)
        self.assertEqual(get_title(message, att, rule), "the message title")
        self.assertEqual(handler.get_title(message, att, rule), "the message title")

    def test_handle_message(self):
        message = create_message(subject="the message title", from_="Myself", num_attachments=2)
@@ -317,7 +325,7 @@ class TestMail(TestCase):
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)

    def test_errors(self):
    def test_error_login(self):
        account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")

        try:
@@ -327,26 +335,84 @@ class TestMail(TestCase):
        else:
            self.fail("Should raise exception")

    def test_error_skip_account(self):
        account_faulty = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wroasdng")

        account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")
        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE,
                                       action_parameter="spam", filter_subject="Claim")

        tasks.process_mail_accounts()
        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)

    def test_error_skip_rule(self):

        account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE,
                                       action_parameter="spam", filter_subject="Claim", order=1, folder="uuuhhhh")
        rule2 = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE,
                                        action_parameter="spam", filter_subject="Claim", order=2)

        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(self.async_task.call_count, 1)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)


    @mock.patch("paperless_mail.mail.MailAccountHandler.get_correspondent")
    def test_error_skip_mail(self, m):

        def get_correspondent_fake(message, rule):
            if message.from_ == 'amazon@amazon.de':
                raise ValueError("Does not compute.")
            else:
                return None

        m.side_effect = get_correspondent_fake

        account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam")

        self.mail_account_handler.handle_mail_account(account)

        # test that we still consume mail even if some mails throw errors.
        self.assertEqual(self.async_task.call_count, 2)

        # faulty mail still in inbox, untouched
        self.assertEqual(len(self.bogus_mailbox.messages), 1)
        self.assertEqual(self.bogus_mailbox.messages[0].from_, 'amazon@amazon.de')

    def test_error_create_correspondent(self):

        account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
        rule = MailRule.objects.create(
            name="testrule", filter_from="amazon@amazon.de",
            account=account, action=MailRule.ACTION_MOVE, action_parameter="spam",
            assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)

        self.mail_account_handler.handle_mail_account(account)

        self.async_task.assert_called_once()
        args, kwargs = self.async_task.call_args

        c = Correspondent.objects.get(name="amazon@amazon.de")
        # should work
        self.assertEquals(kwargs['override_correspondent_id'], c.id)

        self.async_task.reset_mock()
        self.reset_bogus_mailbox()

        with mock.patch("paperless_mail.mail.Correspondent.objects.get_or_create") as m:
            m.side_effect = DatabaseError()

        try:
            self.mail_account_handler.handle_mail_account(account)
        except MailError as e:
            self.assertTrue("uuuh does not exist" in str(e))
        else:
            self.fail("Should raise exception")

        account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
        args, kwargs = self.async_task.call_args
        self.async_task.assert_called_once()
        self.assertEquals(kwargs['override_correspondent_id'], None)

        rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")

        try:
            self.mail_account_handler.handle_mail_account(account)
        except MailError as e:
            self.assertTrue("Error while processing post-consume actions" in str(e))
        else:
            self.fail("Should raise exception")

    def test_filters(self):

@@ -390,3 +456,43 @@ class TestMail(TestCase):
        self.mail_account_handler.handle_mail_account(account)
        self.assertEqual(len(self.bogus_mailbox.messages), 2)
        self.assertEqual(self.async_task.call_count, 5)

class TestManagementCommand(TestCase):

    @mock.patch("paperless_mail.management.commands.mail_fetcher.tasks.process_mail_accounts")
    def test_mail_fetcher(self, m):

        call_command("mail_fetcher")

        m.assert_called_once()

class TestTasks(TestCase):

    @mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account")
    def test_all_accounts(self, m):
        m.side_effect = lambda account: 6

        MailAccount.objects.create(name="A", imap_server="A", username="A", password="A")
        MailAccount.objects.create(name="B", imap_server="A", username="A", password="A")

        result = tasks.process_mail_accounts()

        self.assertEqual(m.call_count, 2)
        self.assertIn("Added 12", result)

        m.side_effect = lambda account: 0
        result = tasks.process_mail_accounts()
        self.assertIn("No new", result)

    @mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account")
    def test_single_accounts(self, m):

        MailAccount.objects.create(name="A", imap_server="A", username="A", password="A")

        tasks.process_mail_account("A")

        m.assert_called_once()
        m.reset_mock()

        tasks.process_mail_account("B")
        m.assert_not_called()
@@ -0,0 +1,2 @@
# this is here so that django finds the checks.
from .checks import *
src/paperless_tesseract/checks.py (new file, 34 lines)
@@ -0,0 +1,34 @@
import subprocess

from django.conf import settings
from django.core.checks import Error, Warning, register


def get_tesseract_langs():
    with subprocess.Popen(['tesseract', '--list-langs'],
                          stdout=subprocess.PIPE) as p:
        stdout, stderr = p.communicate()

    return stdout.decode().strip().split("\n")[1:]


@register()
def check_default_language_available(app_configs, **kwargs):
    installed_langs = get_tesseract_langs()

    if not settings.OCR_LANGUAGE:
        return [Warning(
            "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
            "This means that tesseract will fall back to English."
        )]

    specified_langs = settings.OCR_LANGUAGE.split("+")

    for lang in specified_langs:
        if lang not in installed_langs:
            return [Error(
                f"The selected ocr language {lang} is "
                f"not installed. Paperless cannot OCR your documents "
                f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]

    return []
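For reference, a sketch of what get_tesseract_langs() parses: `tesseract --list-langs` prints a header line before the language codes, which the `[1:]` slice drops. The output below is illustrative, not taken from the diff:

    # $ tesseract --list-langs
    # List of available languages (3):
    # deu
    # eng
    # osd
    #
    # get_tesseract_langs() -> ["deu", "eng", "osd"]
    # check_default_language_available then verifies every part of
    # settings.OCR_LANGUAGE.split("+"), e.g. "eng+deu" -> ["eng", "deu"].
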
@@ -1,23 +1,15 @@
import itertools
import json
import os
import re
import subprocess
from multiprocessing.pool import ThreadPool

import langdetect
import ocrmypdf
import pdftotext
import pyocr
from PIL import Image
from django.conf import settings
from pyocr import PyocrException
from ocrmypdf import InputFileError, EncryptedPdfError

from documents.parsers import DocumentParser, ParseError, run_unpaper, \
    run_convert
from .languages import ISO639


class OCRError(Exception):
    pass
from documents.parsers import DocumentParser, ParseError, run_convert


class RasterisedDocumentParser(DocumentParser):
@@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser):
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    def __init__(self, path, logging_group, progress_callback):
        super().__init__(path, logging_group, progress_callback)
        self._text = None

    def get_thumbnail(self):
    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """
@@ -43,8 +31,8 @@ class RasterisedDocumentParser(DocumentParser):
                scale="500x5000>",
                alpha="remove",
                strip=True,
                trim=True,
                input_file="{}[0]".format(self.document_path),
                trim=False,
                input_file="{}[0]".format(document_path),
                output_file=out_path,
                logging_group=self.logging_group)
        except ParseError:
@@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser):
                   "-q",
                   "-sDEVICE=pngalpha",
                   "-o", gs_out_path,
                   self.document_path]
                   document_path]
            if not subprocess.Popen(cmd).wait() == 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
@@ -67,187 +55,160 @@ class RasterisedDocumentParser(DocumentParser):
                scale="500x5000>",
                alpha="remove",
                strip=True,
                trim=True,
                trim=False,
                input_file=gs_out_path,
                output_file=out_path,
                logging_group=self.logging_group)

        return out_path

    def _is_ocred(self):

        # Extract text from PDF using pdftotext
        text = get_text_from_pdf(self.document_path)

        # We assume that a PDF with at least 50 characters contains text
        # (so no OCR required)
        return len(text) > 50

    def get_text(self):

        if self._text is not None:
            return self._text

        if not settings.OCR_ALWAYS and self._is_ocred():
            self.log("debug", "Skipping OCR, using Text from PDF")
            self._text = get_text_from_pdf(self.document_path)
            return self._text

        self.progress_callback(0, 1, "Making greyscale images.")
        images = self._get_greyscale()

        if not images:
            raise ParseError("Empty document, nothing to do.")
    def is_image(self, mime_type):
        return mime_type in [
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
        ]

    def get_dpi(self, image):
        try:

            sample_page_index = int(len(images) / 2)
            self.log(
                "debug",
                f"Attempting language detection on page "
                f"{sample_page_index + 1} of {len(images)}...")
            self.progress_callback(0.4, 1, "Language Detection.")
            sample_page_text = self._ocr([images[sample_page_index]],
                                         settings.OCR_LANGUAGE)[0]
            guessed_language = self._guess_language(sample_page_text)
            self.progress_callback(0.6, 1, "OCR all the pages.")

            if not guessed_language or guessed_language not in ISO639:
                self.log("warning", "Language detection failed.")
                ocr_pages = self._complete_ocr_default_language(
                    images, sample_page_index, sample_page_text)

            elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
                self.log(
                    "debug",
                    f"Detected language: {guessed_language} "
                    f"(default language)")
                ocr_pages = self._complete_ocr_default_language(
                    images, sample_page_index, sample_page_text)

            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501
                self.log(
                    "warning",
                    f"Detected language {guessed_language} is not available "
                    f"on this system.")
                ocr_pages = self._complete_ocr_default_language(
                    images, sample_page_index, sample_page_text)

            else:
                self.log("debug", f"Detected language: {guessed_language}")
                ocr_pages = self._ocr(
                    images, ISO639[guessed_language], report_progress=True)

            self.log("debug", "OCR completed.")
            self._text = strip_excess_whitespace(" ".join(ocr_pages))
            return self._text

        except OCRError as e:
            raise ParseError(e)

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR
        """

        # Convert PDF to multiple PNMs
        input_file = self.document_path

        if settings.OCR_PAGES == 1:
            input_file += "[0]"
        elif settings.OCR_PAGES > 1:
            input_file += f"[0-{settings.OCR_PAGES - 1}]"

        self.log(
            "debug",
            f"Converting document {input_file} into greyscale images")

        output_files = os.path.join(self.tempdir, "convert-%04d.pnm")

        run_convert(density=settings.CONVERT_DENSITY,
                    depth="8",
                    type="grayscale",
                    input_file=input_file,
                    output_file=output_files,
                    logging_group=self.logging_group)

        # Get a list of converted images
        pnms = []
        for f in os.listdir(self.tempdir):
            if f.endswith(".pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        self.log("debug", f"Running unpaper on {len(pnms)} pages...")

        self.progress_callback(0.2, 1, "Running unpaper on {} pages...".format(len(pnms)))

        # Run unpaper in parallel on converted images
        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
            pnms = pool.map(run_unpaper, pnms)

        return sorted(filter(lambda __: os.path.isfile(__), pnms))

    def _guess_language(self, text):
        try:
            guess = langdetect.detect(text)
            return guess
            with Image.open(image) as im:
                x, y = im.info['dpi']
                return x
        except Exception as e:
            self.log('warning', f"Language detection failed with: {e}")
            self.log(
                'warning',
                f"Error while getting DPI from image {image}: {e}")
            return None

    def _ocr(self, imgs, lang, report_progress=False):
        self.log(
            "debug",
            f"Performing OCR on {len(imgs)} page(s) with language {lang}")
        r = []
        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
            # r = pool.map(image_to_string, itertools.product(imgs, [lang]))
            for i, page in enumerate(pool.imap(image_to_string, itertools.product(imgs, [lang]))):
                if report_progress:
                    self.progress_callback(0.6 + (i / len(imgs)) * 0.4, 1, "OCR'ed {} pages".format(i+1))
                r += [page]
        return r
    def parse(self, document_path, mime_type):
        mode = settings.OCR_MODE

    def _complete_ocr_default_language(self,
                                       images,
                                       sample_page_index,
                                       sample_page):
        images_copy = list(images)
        del images_copy[sample_page_index]
        if images_copy:
            self.log('debug', "Continuing ocr with default language.")
            ocr_pages = self._ocr(
                images_copy, settings.OCR_LANGUAGE, report_progress=True)
            ocr_pages.insert(sample_page_index, sample_page)
            return ocr_pages
        text_original = get_text_from_pdf(document_path)
        has_text = text_original and len(text_original) > 50

        if mode == "skip_noarchive" and has_text:
            self.log("debug",
                     "Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return

        if mode in ['skip', 'skip_noarchive'] and not has_text:
            # upgrade to redo, since there appears to be no text in the
            # document. This happens to some weird encrypted documents or
            # documents with failed OCR attempts for which OCRmyPDF will
            # still report that there actually is text in them.
            self.log("debug",
                     "No text was found in the document and skip is "
                     "specified. Upgrading OCR mode to redo.")
            mode = "redo"

        archive_path = os.path.join(self.tempdir, "archive.pdf")

        ocr_args = {
            'input_file': document_path,
            'output_file': archive_path,
            'use_threads': True,
            'jobs': settings.THREADS_PER_WORKER,
            'language': settings.OCR_LANGUAGE,
            'output_type': settings.OCR_OUTPUT_TYPE,
            'progress_bar': False,
            'clean': True
        }

        if settings.OCR_PAGES > 0:
            ocr_args['pages'] = f"1-{settings.OCR_PAGES}"

        # Mode selection.

        if mode in ['skip', 'skip_noarchive']:
            ocr_args['skip_text'] = True
        elif mode == 'redo':
            ocr_args['redo_ocr'] = True
        elif mode == 'force':
            ocr_args['force_ocr'] = True
        else:
            return [sample_page]
            raise ParseError(
                f"Invalid ocr mode: {mode}")

        if self.is_image(mime_type):
            dpi = self.get_dpi(document_path)
            if dpi:
                self.log(
                    "debug",
                    f"Detected DPI for image {document_path}: {dpi}"
                )
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.")

        if settings.OCR_USER_ARGS:
            try:
                user_args = json.loads(settings.OCR_USER_ARGS)
                ocr_args = {**ocr_args, **user_args}
            except Exception as e:
                self.log(
                    "warning",
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used: {e}")

        # This forces tesseract to use one core per page.
        os.environ['OMP_THREAD_LIMIT'] = "1"

        try:
            self.log("debug",
                     f"Calling OCRmyPDF with {str(ocr_args)}")
            ocrmypdf.ocr(**ocr_args)
            # success! announce results
            self.archive_path = archive_path
            self.text = get_text_from_pdf(archive_path)

        except (InputFileError, EncryptedPdfError) as e:

            self.log("debug",
                     f"Encountered an error: {e}. Trying to use text from "
                     f"original.")
            # This happens with some PDFs when used with the redo_ocr option.
            # This is not the end of the world, we'll just use what we already
            # have in the document.
            self.text = text_original
            # Also, no archived file.
            if not self.text:
                # However, if we don't have anything, fail:
                raise ParseError(e)

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(e)

        if not self.text:
            # This may happen for files that don't have any text.
            self.log(
                'warning',
                f"Document {document_path} does not have any text. "
                f"This is probably an error or you tried to add an image "
                f"without text, or something is wrong with this document.")
            self.text = ""


def strip_excess_whitespace(text):
    if not text:
        return None

    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", '', no_leading_whitespace)
    return no_trailing_whitespace
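Worked examples for strip_excess_whitespace, taken from the test cases removed later in this diff:

    strip_excess_whitespace("simple newline\n testing string")
    # -> "simple newline\ntesting string"
    strip_excess_whitespace("utf-8 строка с пробелами в конце ")
    # -> "utf-8 строка с пробелами в конце"
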


def image_to_string(args):
    img, lang = args
    ocr = pyocr.get_available_tools()[0]
    with Image.open(img) as f:
        if ocr.can_detect_orientation():
            try:
                orientation = ocr.detect_orientation(f, lang=lang)
                f = f.rotate(orientation["angle"], expand=1)
            except Exception:
                # Rotation not possible, ignore
                pass
        try:
            return ocr.image_to_string(f, lang=lang)
        except PyocrException as e:
            raise OCRError(e)
    # TODO: this needs a rework
    return no_trailing_whitespace.strip()


def get_text_from_pdf(pdf_file):
@@ -256,6 +217,9 @@ def get_text_from_pdf(pdf_file):
    try:
        pdf = pdftotext.PDF(f)
    except pdftotext.Error:
        return ""
        # might not be a PDF file
        return None

    return "\n".join(pdf)
    text = "\n".join(pdf)

    return strip_excess_whitespace(text)
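A small illustration of the changed contract (the call site is hypothetical): get_text_from_pdf() now returns None when pdftotext cannot open the file, and whitespace-normalized text otherwise.

    text = get_text_from_pdf("some_file.bin")
    if text is None:
        pass  # not parseable as a PDF; callers treat this as "no text layer"
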
@@ -5,9 +5,12 @@ def tesseract_consumer_declaration(sender, **kwargs):
    return {
        "parser": RasterisedDocumentParser,
        "weight": 0,
        "mime_types": [
            "application/pdf",
            "image/jpeg",
            "image/png"
        ]
        "mime_types": {
            "application/pdf": ".pdf",
            "image/jpeg": ".jpg",
            "image/png": ".png",
            "image/tiff": ".tif",
            "image/gif": ".gif",
            "image/bmp": ".bmp",
        }
    }
BIN  src/paperless_tesseract/tests/samples/multi-page-digital.pdf (new file)
BIN  src/paperless_tesseract/tests/samples/multi-page-images.pdf (new file)
BIN  src/paperless_tesseract/tests/samples/no-text-alpha.png (new file, 32 KiB)
BIN  src/paperless_tesseract/tests/samples/simple-alpha.png (new file, 8.2 KiB)
BIN  src/paperless_tesseract/tests/samples/simple-digital.pdf (new file)
BIN  src/paperless_tesseract/tests/samples/simple-no-dpi.png (new file, 6.8 KiB)
BIN  src/paperless_tesseract/tests/samples/simple.bmp (new file, 1.7 MiB)
BIN  src/paperless_tesseract/tests/samples/simple.gif (new file, 18 KiB)
BIN  src/paperless_tesseract/tests/samples/simple.jpg (new file, 19 KiB)
BIN  (modified image: 7.7 KiB before, 7.2 KiB after)
BIN  src/paperless_tesseract/tests/samples/simple.tif (new file)
BIN  src/paperless_tesseract/tests/samples/with-form.pdf (new file)
@@ -1,193 +0,0 @@
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
from unittest import mock
|
||||
from uuid import uuid4
|
||||
|
||||
from dateutil import tz
|
||||
from django.conf import settings
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from ..parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class TestDate(TestCase):
|
||||
|
||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
||||
SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
|
||||
|
||||
def setUp(self):
|
||||
os.makedirs(self.SCRATCH, exist_ok=True)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.SCRATCH)
|
||||
|
||||
@override_settings(SCRATCH_DIR=SCRATCH)
|
||||
def test_date_format_1(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file, None)
|
||||
document._text = "lorem ipsum 130218 lorem ipsum"
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@override_settings(SCRATCH_DIR=SCRATCH)
|
||||
def test_date_format_2(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file, None)
|
||||
document._text = "lorem ipsum 2018 lorem ipsum"
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@override_settings(SCRATCH_DIR=SCRATCH)
|
||||
def test_date_format_3(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file, None)
|
||||
document._text = "lorem ipsum 20180213 lorem ipsum"
|
||||
self.assertEqual(document.get_date(), None)
|
||||
|
||||
@override_settings(SCRATCH_DIR=SCRATCH)
|
||||
def test_date_format_4(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
        document = RasterisedDocumentParser(input_file, None)
        document._text = "lorem ipsum 13.02.2018 lorem ipsum"
        date = document.get_date()
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_5(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
            "ipsum"
        )
        date = document.get_date()
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_6(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum"
        )
        self.assertEqual(document.get_date(), None)

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_7(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum\n"
            "März 2019\n"
            "lorem ipsum"
        )
        date = document.get_date()
        self.assertEqual(
            date,
            datetime.datetime(
                2019, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_8(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum\n"
            "März 2020"
        )
        self.assertEqual(
            document.get_date(),
            datetime.datetime(
                2020, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_9(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum\n"
            "27. Nullmonth 2020\n"
            "März 2020\n"
            "lorem ipsum"
        )
        self.assertEqual(
            document.get_date(),
            datetime.datetime(
                2020, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="01-07-0590 00:00:00"
    )
    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_crazy_date_past(self, *args):
        document = RasterisedDocumentParser("/dev/null", None)
        document.get_text()
        self.assertIsNone(document.get_date())

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="01-07-2350 00:00:00"
    )
    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_crazy_date_future(self, *args):
        document = RasterisedDocumentParser("/dev/null", None)
        document.get_text()
        self.assertIsNone(document.get_date())

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="20 408000l 2475"
    )
    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_crazy_date_with_spaces(self, *args):
        document = RasterisedDocumentParser("/dev/null", None)
        document.get_text()
        self.assertIsNone(document.get_date())

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="No date in here"
    )
    @override_settings(FILENAME_DATE_ORDER="YMD")
    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_filename_date_parse_invalid(self, *args):
        document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None)
        document.get_text()
        self.assertIsNone(document.get_date())
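All of these tests follow one recipe: seed the parser's raw text, call get_date(), and compare against a timezone-aware datetime. As a hedged illustration of the underlying idea (not the project's get_date implementation; DATE_REGEX and find_date are made-up names), a day-first string such as "13.02.2018" can be resolved with the dateparser library:

import re

import dateparser

# Matches day.month.year tokens such as "13.02.2018".
DATE_REGEX = re.compile(r"\b\d{1,2}\.\d{1,2}\.\d{4}\b")


def find_date(text):
    match = DATE_REGEX.search(text)
    if match is None:
        return None
    # DATE_ORDER makes the day-first interpretation explicit.
    return dateparser.parse(match.group(0), settings={"DATE_ORDER": "DMY"})


print(find_date("lorem ipsum 13.02.2018 lorem ipsum"))  # 2018-02-13 00:00:00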
@@ -1,76 +0,0 @@
import os
from unittest import mock, skipIf

import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError

from ..parsers import image_to_string, strip_excess_whitespace


class FakeTesseract(object):

    @staticmethod
    def can_detect_orientation():
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        return "This is test text"


class FakePyOcr(object):

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]


class TestOCR(TestCase):

    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract
        results in a segmentation fault rooted somewhere deep inside pyocr
        where I don't care to dig. Regardless, if you run the consumer
        normally, text-free pages are now handled correctly so long as we
        work around this weird exception.
        """
        image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"])
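The docstring above hints at what the consumer actually has to do: swallow the tool-specific TesseractError that pyocr raises when orientation detection runs on a page without text. A minimal sketch of such a guard (detect_orientation_safely is an illustrative helper, not the project's code):

def detect_orientation_safely(tool, image, lang):
    try:
        return tool.detect_orientation(image, lang=lang)
    except Exception:
        # pyocr raises its tool-specific TesseractError on text-free
        # pages; treat that as "the page is already upright".
        return {"angle": 0}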
@@ -1,46 +1,17 @@
import os
import shutil
import tempfile
import uuid
from typing import ContextManager
from unittest import mock

from django.test import TestCase, override_settings
from pyocr.error import TesseractError

from documents.parsers import ParseError, run_convert
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError
from documents.tests.utils import DirectoriesMixin
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace

image_to_string_calls = []


class FakeTesseract(object):

    @staticmethod
    def can_detect_orientation():
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        raise TesseractError("arbitrary status", "message")

    @staticmethod
    def get_available_languages():
        return ['eng', 'deu']

    @staticmethod
    def image_to_string(file_handle, lang):
        image_to_string_calls.append((file_handle.name, lang))
        return file_handle.read()


class FakePyOcr(object):

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]


def fake_convert(input_file, output_file, **kwargs):
    with open(input_file) as f:
        lines = f.readlines()
@@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs):
            f2.write(line.strip())


def fake_unpaper(pnm):
    output = pnm + ".unpaper.pnm"
    shutil.copy(pnm, output)
    return output


class FakeImageFile(ContextManager):
    def __init__(self, fname):
        self.fname = fname
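fake_convert, fake_unpaper and the Fake* classes above all serve the same purpose: they swap an external dependency (tesseract via pyocr, ImageMagick's convert, unpaper) for a pure-Python stand-in so the tests never shell out. They are wired in with mock.patch, exactly as on the test class below; a minimal usage sketch:

from unittest import mock

with mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper):
    # Within this block, anything the parser module does with run_unpaper
    # merely copies the input file instead of invoking the real binary.
    pass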
@@ -67,142 +32,50 @@ class FakeImageFile(ContextManager):
        return os.path.basename(self.fname)


fake_image = FakeImageFile


@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert)
@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper)
@mock.patch("paperless_tesseract.parsers.Image.open", open)
class TestRasterisedDocumentParser(TestCase):
class TestParser(DirectoriesMixin, TestCase):

    def setUp(self):
        self.scratch = tempfile.mkdtemp()
    def assertContainsStrings(self, content, strings):
        # Asserts that all strings appear in content, in the given order.
        indices = [content.index(s) for s in strings]
        self.assertListEqual(indices, sorted(indices))

        global image_to_string_calls
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

        image_to_string_calls = []

        override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable()

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def get_input_file(self, pages):
        _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch)
        with open(fname, "w") as f:
            f.writelines([f"line {p}\n" for p in range(pages)])
        return fname

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_simple_language_match(self):
        parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_2_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_3_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None)
    def test_parse_text_lang_detect_failed(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it")
    def test_parse_text_lang_not_installed(self):
        parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2 line 3")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_text_lang_mismatch(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_empty_doc(self):
        parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())
        try:
            parser.get_text()
        except ParseError as e:
            self.assertEqual("Empty document, nothing to do.", str(e))
        else:
            self.fail("Should raise exception")


class TestAuxilliaryFunctions(TestCase):

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        override_settings(SCRATCH_DIR=self.scratch).enable()

    def tearDown(self):
        shutil.rmtree(self.scratch)
    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def test_get_text_from_pdf(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf'))
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'))

        self.assertEqual(text.strip(), "This is a test document.")

    def test_get_text_from_pdf_error(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png'))

        self.assertEqual(text.strip(), "")

    def test_image_to_string(self):
        text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng"))

        self.assertEqual(text, "This is a test document.")

    def test_image_to_string_language_unavailable(self):
        try:
            image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita"))
        except OCRError as e:
            self.assertTrue("Failed loading language" in str(e))
        else:
            self.fail("Should raise exception")

    @override_settings(OCR_ALWAYS=False)
    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf")
    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale")
    def test_is_ocred(self, m2, m):
        parser = RasterisedDocumentParser("", uuid.uuid4())
        m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \
                         "lots of text lots of text lots of text lots of text lots of text lots of text " \
                         "lots of text lots of text lots of text lots of text lots of text lots of text "
        parser.get_text()
        self.assertEqual(m.call_count, 2)
        self.assertEqual(m2.call_count, 0)
        self.assertContainsStrings(text.strip(), ["This is a test document."])

    def test_thumbnail(self):
        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        parser = RasterisedDocumentParser(uuid.uuid4())
        parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
        # don't really know how to test it; just call it and assert that it does not raise anything.

    @mock.patch("paperless_tesseract.parsers.run_convert")
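The assertContainsStrings helper defined at the top of TestParser reduces to an ordered-substring check: every expected string must occur in the content, and their first occurrences must appear in the listed order. A standalone equivalent:

def contains_strings_in_order(content, strings):
    # index() raises ValueError if any string is missing entirely.
    indices = [content.index(s) for s in strings]
    return indices == sorted(indices)


assert contains_strings_in_order("page 1 page 2 page 3", ["page 1", "page 3"])
assert not contains_strings_in_order("page 2 page 1", ["page 1", "page 2"])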
@@ -216,6 +89,191 @@ class TestAuxilliaryFunctions(TestCase):

        m.side_effect = call_convert

        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        parser = RasterisedDocumentParser(uuid.uuid4())
        parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
        # don't really know how to test it; just call it and assert that it does not raise anything.

    def test_get_dpi(self):
        parser = RasterisedDocumentParser(None)

        dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
        self.assertEqual(dpi, None)

        dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png"))
        self.assertEqual(dpi, 72)

    def test_simple_digital(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf")

        self.assertTrue(os.path.isfile(parser.archive_path))

        self.assertContainsStrings(parser.get_text(), ["This is a test document."])

    def test_with_form(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")

        self.assertTrue(os.path.isfile(parser.archive_path))

        self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])

    @override_settings(OCR_MODE="redo")
    def test_with_form_error(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")

        self.assertIsNone(parser.archive_path)
        self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])

    @override_settings(OCR_MODE="redo")
    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None)
    def test_with_form_error_notext(self):
        parser = RasterisedDocumentParser(None)

        def f():
            parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")

        self.assertRaises(ParseError, f)

    @override_settings(OCR_MODE="force")
    def test_with_form_force(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")

        self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])

    def test_image_simple(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png")

        self.assertTrue(os.path.isfile(parser.archive_path))

        self.assertContainsStrings(parser.get_text(), ["This is a test document."])

    def test_image_simple_alpha_fail(self):
        parser = RasterisedDocumentParser(None)

        def f():
            parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png")

        self.assertRaises(ParseError, f)


    def test_image_no_dpi_fail(self):
        parser = RasterisedDocumentParser(None)

        def f():
            parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")

        self.assertRaises(ParseError, f)

    @override_settings(OCR_IMAGE_DPI=72)
    def test_image_no_dpi_default(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")

        self.assertTrue(os.path.isfile(parser.archive_path))

        self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."])

    def test_multi_page(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="skip")
    def test_multi_page_pages_skip(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
    def test_multi_page_pages_redo(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="force")
    def test_multi_page_pages_force(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_MODE="skip")
    def test_multi_page_analog_pages_skip(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
    def test_multi_page_analog_pages_redo(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
        self.assertFalse("page 3" in parser.get_text().lower())

    @override_settings(OCR_PAGES=1, OCR_MODE="force")
    def test_multi_page_analog_pages_force(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
        self.assertFalse("page 2" in parser.get_text().lower())
        self.assertFalse("page 3" in parser.get_text().lower())

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_withtext(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertIsNone(parser.archive_path)
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_notext(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])


class TestParserFileTypes(DirectoriesMixin, TestCase):

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def test_bmp(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertTrue("this is a test document" in parser.get_text().lower())

    def test_jpg(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertTrue("this is a test document" in parser.get_text().lower())

    @override_settings(OCR_IMAGE_DPI=200)
    def test_gif(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertTrue("this is a test document" in parser.get_text().lower())

    def test_tiff(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertTrue("this is a test document" in parser.get_text().lower())
@@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser):
    This parser directly parses a text document (.txt, .md, or .csv)
    """

    def __init__(self, path, logging_group):
        super().__init__(path, logging_group)
        self._text = None

    def get_thumbnail(self):
    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a text file is just a 500px wide image of the text
        rendered onto a letter-sized page.
@@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser):
        )

        def read_text():
            with open(self.document_path, 'r') as src:
            with open(document_path, 'r') as src:
                lines = [line.strip() for line in src.readlines()]
                text = "\n".join([line for line in lines[:n_lines]])
                return text.replace('"', "'")
@@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser):

        return out_path

    def get_text(self):

        if self._text is not None:
            return self._text

        with open(self.document_path, 'r') as f:
            self._text = f.read()

        return self._text
    def parse(self, document_path, mime_type):
        with open(document_path, 'r') as f:
            self.text = f.read()


def run_command(*args):
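The refactor trades the lazy, caching get_text() for an eager parse() that reads the file once and stores the result on the instance (the base class is assumed to expose the text attribute). The same pattern in isolation, with an illustrative class name:

class EagerTextParser:
    def __init__(self):
        self.text = None

    def parse(self, document_path, mime_type):
        # Read the whole document up front; callers then use self.text.
        with open(document_path, 'r') as f:
            self.text = f.read()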
@@ -5,8 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
    return {
        "parser": TextDocumentParser,
        "weight": 10,
        "mime_types": [
            "text/plain",
            "text/comma-separated-values"
        ]
        "mime_types": {
            "text/plain": ".txt",
            "text/csv": ".csv",
        }
    }
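The declaration changes mime_types from a plain list into a mapping from MIME type to a default file extension (and swaps text/comma-separated-values for the registered text/csv type), so a consumer can derive an extension for the files it writes. A hypothetical lookup built on that mapping:

mime_types = {
    "text/plain": ".txt",
    "text/csv": ".csv",
}


def extension_for(mime_type):
    # Fall back to an empty extension for unknown types.
    return mime_types.get(mime_type, "")


assert extension_for("text/csv") == ".csv"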
@@ -3,10 +3,9 @@ exclude = migrations, paperless/settings.py, .tox, */tests/*

[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all
addopts = --pythonwarnings=all --cov --cov-report=html -n auto
env =
    PAPERLESS_SECRET=paperless
    PAPERLESS_EMAIL_SECRET=paperless
    PAPERLESS_DISABLE_DBHANDLER=true


[coverage:run]
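Note that the new addopts line assumes two pytest plugins are installed: pytest-cov (which provides --cov and --cov-report) and pytest-xdist (which provides -n auto). Without them, pytest aborts at startup with an unrecognized-arguments error.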