Merge branch 'dev' into feature-bulk-edit

jonaswinkler committed 2020-12-06 02:12:15 +01:00
115 changed files with 3607 additions and 1553 deletions

View File

@@ -6,13 +6,15 @@ import os
import magic
from django.conf import settings
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory
from .file_handling import create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type
from .parsers import ParseError, get_parser_class_for_mime_type, \
get_supported_file_extensions, parse_date
from .signals import (
document_consumption_finished,
document_consumption_started
@@ -42,7 +44,7 @@ class Consumer(LoggingMixin):
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
if Document.objects.filter(checksum=checksum).exists():
if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists(): # NOQA: E501
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
raise ConsumerError(
@@ -53,6 +55,7 @@ class Consumer(LoggingMixin):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)
def try_consume_file(self,
path,
@@ -107,7 +110,7 @@ class Consumer(LoggingMixin):
# This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.path, self.logging_group)
document_parser = parser_class(self.logging_group)
# However, this already created working directories which we have to
# clean up.
@@ -115,13 +118,24 @@ class Consumer(LoggingMixin):
# Parse the document. This may take some time.
try:
self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail()
self.log("debug", "Parsing {}...".format(self.filename))
document_parser.parse(self.path, mime_type)
self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail(
self.path, mime_type)
text = document_parser.get_text()
date = document_parser.get_date()
if not date:
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
except ParseError as e:
document_parser.cleanup()
self.log(
"error",
f"Error while consuming document {self.filename}: {e}")
raise ConsumerError(e)
# Prepare the document classifier.
@@ -163,9 +177,24 @@ class Consumer(LoggingMixin):
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
# TODO: not required, since this is done by the file handling
# logic
create_source_path_directory(document.source_path)
self._write(document, self.path, document.source_path)
self._write(document, thumbnail, document.thumbnail_path)
self._write(document.storage_type,
self.path, document.source_path)
self._write(document.storage_type,
thumbnail, document.thumbnail_path)
if archive_path and os.path.isfile(archive_path):
self._write(document.storage_type,
archive_path, document.archive_path)
with open(archive_path, 'rb') as f:
document.archive_checksum = hashlib.md5(
f.read()).hexdigest()
document.save()
# After performing all database operations and moving files
# into place, tell paperless where the file is.
@@ -178,6 +207,11 @@ class Consumer(LoggingMixin):
self.log("debug", "Deleting file {}".format(self.path))
os.unlink(self.path)
except Exception as e:
self.log(
"error",
f"The following error occured while consuming "
f"{self.filename}: {e}"
)
raise ConsumerError(e)
finally:
document_parser.cleanup()
@@ -242,7 +276,7 @@ class Consumer(LoggingMixin):
for tag_id in self.override_tag_ids:
document.tags.add(Tag.objects.get(pk=tag_id))
def _write(self, document, source, target):
def _write(self, storage_type, source, target):
with open(source, "rb") as read_file:
with open(target, "wb") as write_file:
write_file.write(read_file.read())

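Note: the duplicate pre-check above now hashes the incoming file once and matches the digest against both the original and the archived checksums. A minimal standalone sketch of that query, assuming the Document fields added by the migration further down:

import hashlib

from django.db.models import Q

from documents.models import Document


def is_duplicate(path):
    # Hash the incoming file once, then look for the digest among both
    # the original checksums and the archived checksums.
    with open(path, "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()
    return Document.objects.filter(
        Q(checksum=digest) | Q(archive_checksum=digest)
    ).exists()
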
View File

@@ -10,10 +10,13 @@ def create_source_path_directory(source_path):
os.makedirs(os.path.dirname(source_path), exist_ok=True)
def delete_empty_directories(directory):
def delete_empty_directories(directory, root):
if not os.path.isdir(directory):
return
# Go up in the directory hierarchy and try to delete all directories
directory = os.path.normpath(directory)
root = os.path.normpath(settings.ORIGINALS_DIR)
root = os.path.normpath(root)
if not directory.startswith(root + os.path.sep):
# don't do anything outside the given root folder.
@@ -101,3 +104,8 @@ def generate_filename(doc):
filename += ".gpg"
return filename
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"

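Note: archive_name_from_filename keeps the original's relative path and swaps the extension for .pdf, and delete_empty_directories now refuses to climb above a caller-supplied root. A quick usage sketch with illustrative paths:

from documents.file_handling import (archive_name_from_filename,
                                     delete_empty_directories)

# The archived twin of an original keeps its relative path, PDF suffix:
assert archive_name_from_filename("2020/scan.jpg") == "2020/scan.pdf"

# Prune empty subdirectories after a move, but never above the root:
delete_empty_directories("/data/originals/2020/old", root="/data/originals")
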
View File

@@ -1,59 +0,0 @@
import os
import tempfile
from datetime import datetime
from time import mktime
import magic
from django import forms
from django.conf import settings
from django_q.tasks import async_task
from pathvalidate import validate_filename, ValidationError
from documents.parsers import is_mime_type_supported
class UploadForm(forms.Form):
document = forms.FileField()
def clean_document(self):
document_name = self.cleaned_data.get("document").name
try:
validate_filename(document_name)
except ValidationError:
raise forms.ValidationError("That filename is suspicious.")
document_data = self.cleaned_data.get("document").read()
mime_type = magic.from_buffer(document_data, mime=True)
if not is_mime_type_supported(mime_type):
raise forms.ValidationError("This mime type is not supported.")
return document_name, document_data
def save(self):
"""
Since the consumer already does a lot of work, it's easier just to save
to-be-consumed files to the consumption directory rather than have the
form do that as well. Think of it as a poor-man's queue server.
"""
original_filename, data = self.cleaned_data.get("document")
t = int(mktime(datetime.now().timetuple()))
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
dir=settings.SCRATCH_DIR,
delete=False) as f:
f.write(data)
os.utime(f.name, times=(t, t))
async_task("documents.tasks.consume_file",
f.name,
override_filename=original_filename,
task_name=os.path.basename(original_filename)[:100])

View File

@@ -4,10 +4,11 @@ from contextlib import contextmanager
from django.conf import settings
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.writing import AsyncWriter
@@ -59,14 +60,19 @@ def get_schema():
id=NUMERIC(stored=True, unique=True, numtype=int),
title=TEXT(stored=True),
content=TEXT(),
correspondent=TEXT(stored=True)
correspondent=TEXT(stored=True),
tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
type=TEXT(stored=True),
created=DATETIME(stored=True, sortable=True),
modified=DATETIME(stored=True, sortable=True),
added=DATETIME(stored=True, sortable=True),
)
def open_index(recreate=False):
try:
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR)
return open_dir(settings.INDEX_DIR, schema=get_schema())
except Exception as e:
logger.error(f"Error while opening the index: {e}, recreating.")
@@ -76,16 +82,27 @@ def open_index(recreate=False):
def update_document(writer, doc):
# TODO: this has caused many issues in the past. We need to make sure
# that this method does not get called with deserialized documents
# (i.e., document objects that don't come from Django's ORM interfaces
# directly).
logger.debug("Indexing {}...".format(doc))
tags = ",".join([t.name for t in doc.tags.all()])
writer.update_document(
id=doc.pk,
title=doc.title,
content=doc.content,
correspondent=doc.correspondent.name if doc.correspondent else None
correspondent=doc.correspondent.name if doc.correspondent else None,
tag=tags if tags else None,
type=doc.document_type.name if doc.document_type else None,
created=doc.created,
added=doc.added,
modified=doc.modified,
)
def remove_document(writer, doc):
# TODO: see above.
logger.debug("Removing {} from index...".format(doc))
writer.delete_by_term('id', doc.pk)
@@ -103,16 +120,27 @@ def remove_document_from_index(document):
@contextmanager
def query_page(ix, query, page):
def query_page(ix, querystring, page):
searcher = ix.searcher()
try:
query_parser = MultifieldParser(["content", "title", "correspondent"],
ix.schema).parse(query)
result_page = searcher.search_page(query_parser, page)
qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"],
ix.schema)
qp.add_plugin(DateParserPlugin())
q = qp.parse(querystring)
result_page = searcher.search_page(q, page)
result_page.results.fragmenter = highlight.ContextFragmenter(
surround=50)
result_page.results.formatter = JsonFormatter()
yield result_page
corrected = searcher.correct_query(q, querystring)
if corrected.query != q:
corrected_query = corrected.string
else:
corrected_query = None
yield result_page, corrected_query
finally:
searcher.close()

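Note: with the new KEYWORD and DATETIME schema fields plus the DateParserPlugin, queries can filter on tags, types, and dates, and query_page now also yields a spelling-corrected query string. A hedged usage sketch (the query itself is illustrative):

from documents import index

ix = index.open_index()

# Field-qualified search; the DateParserPlugin lets Whoosh understand
# the date expression in "added:2020".
with index.query_page(ix, "tag:invoice added:2020", 1) as (results, corrected):
    for hit in results:
        print(hit["id"], hit["title"])
    if corrected:
        print("Did you mean:", corrected)
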
View File

@@ -28,10 +28,10 @@ class LoggingMixin:
def renew_logging_group(self):
self.logging_group = uuid.uuid4()
def log(self, level, message):
def log(self, level, message, **kwargs):
target = ".".join([self.__class__.__module__, self.__class__.__name__])
logger = logging.getLogger(target)
getattr(logger, level)(message, extra={
"group": self.logging_group
})
}, **kwargs)

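Note: forwarding **kwargs lets callers pass options straight through to the underlying logging call. A small sketch, using a hypothetical Demo subclass:

from documents.loggers import LoggingMixin


class Demo(LoggingMixin):
    def run(self):
        try:
            raise ValueError("boom")
        except ValueError:
            # exc_info now reaches logging.Logger.error and attaches
            # the current traceback to the log record.
            self.log("error", "Something failed", exc_info=True)


demo = Demo()
demo.renew_logging_group()  # sets the logging group used by log()
demo.run()
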
View File

@@ -0,0 +1,128 @@
import hashlib
import multiprocessing
import logging
import os
import shutil
import uuid
import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from whoosh.writing import AsyncWriter
from documents.models import Document
from ... import index
from ...file_handling import create_source_path_directory
from ...mixins import Renderable
from ...parsers import get_parser_class_for_mime_type
logger = logging.getLogger(__name__)
def handle_document(document_id):
document = Document.objects.get(id=document_id)
mime_type = document.mime_type
parser_class = get_parser_class_for_mime_type(mime_type)
parser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type)
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), 'rb') as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move fails,
# the database is rolled back.
# We also don't use save() since that triggers the file handling
# logic, and we don't want that yet (the file is not yet in place).
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text()
)
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
with AsyncWriter(index.open_index()) as writer:
index.update_document(writer, document)
except Exception as e:
logger.error(f"Error while parsing document {document}: {str(e)}")
finally:
parser.cleanup()
class Command(Renderable, BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
and document types to all documents, effectively allowing you to
back-tag all previously indexed documents with metadata created (or
modified) after their initial import.
""".replace(" ", "")
def __init__(self, *args, **kwargs):
self.verbosity = 0
BaseCommand.__init__(self, *args, **kwargs)
def add_arguments(self, parser):
parser.add_argument(
"-f", "--overwrite",
default=False,
action="store_true",
help="Recreates the archived document for documents that already "
"have an archived version."
)
parser.add_argument(
"-d", "--document",
default=None,
type=int,
required=False,
help="Specify the ID of a document, and this command will only "
"run on this specific document."
)
def handle(self, *args, **options):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
overwrite = options["overwrite"]
if options['document']:
documents = Document.objects.filter(pk=options['document'])
else:
documents = Document.objects.all()
document_ids = list(map(
lambda doc: doc.id,
filter(
lambda d: overwrite or not d.archive_checksum,
documents
)
))
# Note to future self: this prevents django from reusing database
# connections between processes, which is bad and does not work
# with postgres.
db.connections.close_all()
try:
logging.getLogger().handlers[0].level = logging.ERROR
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
list(tqdm.tqdm(
pool.imap_unordered(
handle_document,
document_ids
),
total=len(document_ids)
))
except KeyboardInterrupt:
print("Aborting...")

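Note: the close_all() comment is worth spelling out. Forked workers inherit the parent's database socket, and two processes sharing one connection corrupt the wire protocol (notably on Postgres); Django reopens a connection per process on first ORM access. The safe pattern, as a minimal sketch with a hypothetical worker:

import multiprocessing

from django import db


def work(document_id):
    # hypothetical worker; each process opens its own DB connection
    # lazily on its first ORM query
    ...


def run_parallel(ids, processes=4):
    # Close inherited connections *before* forking so no two processes
    # ever share a socket.
    db.connections.close_all()
    with multiprocessing.Pool(processes=processes) as pool:
        pool.map(work, ids)
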
View File

@@ -1,31 +1,69 @@
import logging
import os
from pathlib import Path
from time import sleep
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.utils.text import slugify
from django_q.tasks import async_task
from watchdog.events import FileSystemEventHandler
from watchdog.observers.polling import PollingObserver
from documents.models import Tag
from documents.parsers import is_file_ext_supported
try:
from inotify_simple import INotify, flags
from inotifyrecursive import INotify, flags
except ImportError:
INotify = flags = None
logger = logging.getLogger(__name__)
def _consume(file):
try:
if os.path.isfile(file):
async_task("documents.tasks.consume_file",
file,
task_name=os.path.basename(file)[:100])
else:
logger.debug(
f"Not consuming file {file}: File has moved.")
def _tags_from_path(filepath):
"""Walk up the directory tree from filepath to CONSUMPTION_DIr
and get or create Tag IDs for every directory.
"""
tag_ids = set()
path_parts = Path(filepath).relative_to(
settings.CONSUMPTION_DIR).parent.parts
for part in path_parts:
tag_ids.add(Tag.objects.get_or_create(
slug=slugify(part),
defaults={"name": part},
)[0].pk)
return tag_ids
def _consume(filepath):
if os.path.isdir(filepath):
return
if not os.path.isfile(filepath):
logger.debug(
f"Not consuming file {filepath}: File has moved.")
return
if not is_file_ext_supported(os.path.splitext(filepath)[1]):
logger.debug(
f"Not consuming file {filepath}: Unknown file extension.")
return
tag_ids = None
try:
if settings.CONSUMER_SUBDIRS_AS_TAGS:
tag_ids = _tags_from_path(filepath)
except Exception as e:
logger.error(
"Error creating tags from path: {}".format(e))
try:
async_task("documents.tasks.consume_file",
filepath,
override_tag_ids=tag_ids if tag_ids else None,
task_name=os.path.basename(filepath)[:100])
except Exception as e:
# Catch all so that the consumer won't crash.
# This is also what the test case is listening for to check for
@@ -94,6 +132,7 @@ class Command(BaseCommand):
def handle(self, *args, **options):
directory = options["directory"]
recursive = settings.CONSUMER_RECURSIVE
if not directory:
raise CommandError(
@@ -104,24 +143,30 @@ class Command(BaseCommand):
raise CommandError(
f"Consumption directory {directory} does not exist")
for entry in os.scandir(directory):
_consume(entry.path)
if recursive:
for dirpath, _, filenames in os.walk(directory):
for filename in filenames:
filepath = os.path.join(dirpath, filename)
_consume(filepath)
else:
for entry in os.scandir(directory):
_consume(entry.path)
if options["oneshot"]:
return
if settings.CONSUMER_POLLING == 0 and INotify:
self.handle_inotify(directory)
self.handle_inotify(directory, recursive)
else:
self.handle_polling(directory)
self.handle_polling(directory, recursive)
logger.debug("Consumer exiting.")
def handle_polling(self, directory):
def handle_polling(self, directory, recursive):
logging.getLogger(__name__).info(
f"Polling directory for changes: {directory}")
self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
self.observer.schedule(Handler(), directory, recursive=False)
self.observer.schedule(Handler(), directory, recursive=recursive)
self.observer.start()
try:
while self.observer.is_alive():
@@ -132,18 +177,26 @@ class Command(BaseCommand):
self.observer.stop()
self.observer.join()
def handle_inotify(self, directory):
def handle_inotify(self, directory, recursive):
logging.getLogger(__name__).info(
f"Using inotify to watch directory for changes: {directory}")
inotify = INotify()
descriptor = inotify.add_watch(
directory, flags.CLOSE_WRITE | flags.MOVED_TO)
inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO
if recursive:
descriptor = inotify.add_watch_recursive(directory, inotify_flags)
else:
descriptor = inotify.add_watch(directory, inotify_flags)
try:
while not self.stop_flag:
for event in inotify.read(timeout=1000, read_delay=1000):
file = os.path.join(directory, event.name)
_consume(file)
for event in inotify.read(timeout=1000):
if recursive:
path = inotify.get_path(event.wd)
else:
path = directory
filepath = os.path.join(path, event.name)
_consume(filepath)
except KeyboardInterrupt:
pass

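Note: the tagging-by-subdirectory logic is easiest to see on a concrete path. A standalone sketch of the path arithmetic _tags_from_path performs, assuming /consume as the consumption directory:

from pathlib import Path

CONSUMPTION_DIR = "/consume"  # illustrative setting

filepath = "/consume/taxes/2020/receipt.pdf"
parts = Path(filepath).relative_to(CONSUMPTION_DIR).parent.parts
print(parts)  # ('taxes', '2020'): one Tag is fetched or created per part
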
View File

@@ -7,7 +7,8 @@ from django.core import serializers
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document, Correspondent, Tag, DocumentType
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
EXPORTER_ARCHIVE_NAME
from paperless.db import GnuPG
from ...mixins import Renderable
@@ -54,7 +55,6 @@ class Command(Renderable, BaseCommand):
document = document_map[document_dict["pk"]]
unique_filename = f"{document.pk:07}_{document.file_name}"
file_target = os.path.join(self.target, unique_filename)
thumbnail_name = unique_filename + "-thumbnail.png"
@@ -63,6 +63,14 @@ class Command(Renderable, BaseCommand):
document_dict[EXPORTER_FILE_NAME] = unique_filename
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
if os.path.exists(document.archive_path):
archive_name = \
f"{document.pk:07}_archive_{document.archive_file_name}"
archive_target = os.path.join(self.target, archive_name)
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
else:
archive_target = None
print(f"Exporting: {file_target}")
t = int(time.mktime(document.created.timetuple()))
@@ -76,11 +84,18 @@ class Command(Renderable, BaseCommand):
f.write(GnuPG.decrypted(document.thumbnail_file))
os.utime(thumbnail_target, times=(t, t))
if archive_target:
with open(archive_target, "wb") as f:
f.write(GnuPG.decrypted(document.archive_path))
os.utime(archive_target, times=(t, t))
else:
shutil.copy(document.source_path, file_target)
shutil.copy(document.thumbnail_path, thumbnail_target)
if archive_target:
shutil.copy(document.archive_path, archive_target)
manifest += json.loads(
serializers.serialize("json", Correspondent.objects.all()))

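Note: after this change a manifest record can reference up to three files. An illustrative, hand-written entry for a document with pk 123 (field values are made up; the key names come from documents/settings.py):

record = {
    "model": "documents.document",
    "pk": 123,
    "fields": {"title": "invoice"},  # trimmed for illustration
    "__exported_file_name__": "0000123_20201225-invoice.pdf",
    "__exported_thumbnail_name__": "0000123_20201225-invoice.pdf-thumbnail.png",
    "__exported_archive_name__": "0000123_archive_20201225-invoice.pdf",
}
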
View File

@@ -7,8 +7,8 @@ from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError
from documents.models import Document
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
EXPORTER_ARCHIVE_NAME
from ...file_handling import generate_filename, create_source_path_directory
from ...mixins import Renderable
@@ -79,23 +79,41 @@ class Command(Renderable, BaseCommand):
'appear to be in the source directory.'.format(doc_file)
)
if EXPORTER_ARCHIVE_NAME in record:
archive_file = record[EXPORTER_ARCHIVE_NAME]
if not os.path.exists(os.path.join(self.source, archive_file)):
raise CommandError(
f"The manifest file refers to {archive_file} which "
f"does not appear to be in the source directory."
)
def _import_files_from_manifest(self):
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)
for record in self.manifest:
if not record["model"] == "documents.document":
continue
doc_file = record[EXPORTER_FILE_NAME]
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
document = Document.objects.get(pk=record["pk"])
doc_file = record[EXPORTER_FILE_NAME]
document_path = os.path.join(self.source, doc_file)
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
thumbnail_path = os.path.join(self.source, thumb_file)
document.storage_type = storage_type
if EXPORTER_ARCHIVE_NAME in record:
archive_file = record[EXPORTER_ARCHIVE_NAME]
archive_path = os.path.join(self.source, archive_file)
else:
archive_path = None
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.filename = generate_filename(document)
if os.path.isfile(document.source_path):
@@ -106,5 +124,7 @@ class Command(Renderable, BaseCommand):
print(f"Moving {document_path} to {document.source_path}")
shutil.copy(document_path, document.source_path)
shutil.copy(thumbnail_path, document.thumbnail_path)
if archive_path:
shutil.copy(archive_path, document.archive_path)
document.save()

View File

@@ -0,0 +1,23 @@
# Generated by Django 3.1.3 on 2020-11-29 00:48
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('documents', '1004_sanity_check_schedule'),
]
operations = [
migrations.AddField(
model_name='document',
name='archive_checksum',
field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
),
migrations.AlterField(
model_name='document',
name='checksum',
field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
),
]

View File

@@ -1,7 +1,6 @@
# coding=utf-8
import logging
import mimetypes
import os
import re
from collections import OrderedDict
@@ -12,6 +11,9 @@ from django.db import models
from django.utils import timezone
from django.utils.text import slugify
from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension
class MatchingModel(models.Model):
@@ -157,9 +159,15 @@ class Document(models.Model):
max_length=32,
editable=False,
unique=True,
help_text="The checksum of the original document (before it was "
"encrypted). We use this to prevent duplicate document "
"imports."
help_text="The checksum of the original document."
)
archive_checksum = models.CharField(
max_length=32,
editable=False,
blank=True,
null=True,
help_text="The checksum of the archived document."
)
created = models.DateTimeField(
@@ -198,7 +206,7 @@ class Document(models.Model):
ordering = ("correspondent", "title")
def __str__(self):
created = self.created.strftime("%Y%m%d%H%M%S")
created = self.created.strftime("%Y%m%d")
if self.correspondent and self.title:
return "{}: {} - {}".format(
created, self.correspondent, self.title)
@@ -224,14 +232,33 @@ class Document(models.Model):
def source_file(self):
return open(self.source_path, "rb")
@property
def archive_path(self):
if self.filename:
fname = archive_name_from_filename(self.filename)
else:
fname = "{:07}.pdf".format(self.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
@property
def archive_file(self):
return open(self.archive_path, "rb")
@property
def file_name(self):
return slugify(str(self)) + self.file_type
@property
def archive_file_name(self):
return slugify(str(self)) + ".pdf"
@property
def file_type(self):
# TODO: this is not stable across python versions
return mimetypes.guess_extension(str(self.mime_type))
return get_default_file_extension(self.mime_type)
@property
def thumbnail_path(self):

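Note: the archive_path property gives every document a deterministic archive location. A standalone sketch of the mapping, with an illustrative ARCHIVE_DIR:

import os


def archive_path_for(filename, pk, archive_dir="/data/archive"):
    # Mirrors Document.archive_path: reuse the original's filename with
    # a .pdf suffix, or fall back to the zero-padded primary key.
    if filename:
        fname = os.path.splitext(filename)[0] + ".pdf"
    else:
        fname = "{:07}.pdf".format(pk)
    return os.path.join(archive_dir, fname)


assert archive_path_for("2020/scan.jpg", 1) == "/data/archive/2020/scan.pdf"
assert archive_path_for(None, 42) == "/data/archive/0000042.pdf"
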
View File

@@ -1,4 +1,5 @@
import logging
import mimetypes
import os
import re
import shutil
@@ -42,6 +43,40 @@ def is_mime_type_supported(mime_type):
return get_parser_class_for_mime_type(mime_type) is not None
def get_default_file_extension(mime_type):
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"]
if mime_type in supported_mime_types:
return supported_mime_types[mime_type]
ext = mimetypes.guess_extension(mime_type)
if ext:
return ext
else:
return ""
def is_file_ext_supported(ext):
if ext:
return ext.lower() in get_supported_file_extensions()
else:
return False
def get_supported_file_extensions():
extensions = set()
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"]
for mime_type in supported_mime_types:
extensions.update(mimetypes.guess_all_extensions(mime_type))
return extensions
def get_parser_class_for_mime_type(mime_type):
options = []
@@ -107,21 +142,59 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args))
def run_unpaper(pnm, logging_group=None):
pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
def parse_date(filename, text):
"""
Returns the date of the document.
"""
command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
pnm_out)
def __parser(ds, date_order):
"""
Call dateparser.parse with a particular date ordering
"""
return dateparser.parse(
ds,
settings={
"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE":
True
}
)
logger.debug(f"Execute: {' '.join(command_args)}",
extra={'group': logging_group})
date = None
if not subprocess.Popen(command_args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL).wait() == 0:
raise ParseError(f"Unpaper failed at {command_args}")
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
return pnm_out
# if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER:
for m in re.finditer(DATE_REGEX, filename):
date_string = m.group(0)
try:
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
if date is not None and next_year > date.year > 1900:
return date
# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
try:
date = __parser(date_string, settings.DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
if date is not None and next_year > date.year > 1900:
break
else:
date = None
return date
class ParseError(Exception):
@@ -134,26 +207,35 @@ class DocumentParser(LoggingMixin):
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, path, logging_group):
def __init__(self, logging_group):
super().__init__()
self.logging_group = logging_group
self.document_path = path
self.tempdir = tempfile.mkdtemp(
prefix="paperless-", dir=settings.SCRATCH_DIR)
def get_thumbnail(self):
self.archive_path = None
self.text = None
self.date = None
def parse(self, document_path, mime_type):
raise NotImplementedError()
def get_archive_path(self):
return self.archive_path
def get_thumbnail(self, document_path, mime_type):
"""
Returns the path to a file we can use as a thumbnail for this document.
"""
raise NotImplementedError()
def optimise_thumbnail(self, in_path):
def get_optimised_thumbnail(self, document_path, mime_type):
thumbnail = self.get_thumbnail(document_path, mime_type)
if settings.OPTIMIZE_THUMBNAILS:
out_path = os.path.join(self.tempdir, "optipng.png")
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
args = (settings.OPTIPNG_BINARY,
"-silent", "-o5", in_path, "-out", out_path)
"-silent", "-o5", thumbnail, "-out", out_path)
self.log('debug', f"Execute: {' '.join(args)}")
@@ -162,97 +244,13 @@ class DocumentParser(LoggingMixin):
return out_path
else:
return in_path
def get_optimised_thumbnail(self):
return self.optimise_thumbnail(self.get_thumbnail())
return thumbnail
def get_text(self):
"""
Returns the text from the document and only the text.
"""
raise NotImplementedError()
return self.text
def get_date(self):
"""
Returns the date of the document.
"""
def __parser(ds, date_order):
"""
Call dateparser.parse with a particular date ordering
"""
return dateparser.parse(
ds,
settings={
"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE":
True
}
)
date = None
date_string = None
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
title = os.path.basename(self.document_path)
# if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER:
self.log("info", "Checking document title for date")
for m in re.finditer(DATE_REGEX, title):
date_string = m.group(0)
try:
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
if date is not None and next_year > date.year > 1900:
self.log(
"info",
"Detected document date {} based on string {} "
"from document title"
"".format(date.isoformat(), date_string)
)
return date
try:
# getting text after checking filename will save time if only
# looking at the filename instead of the whole text
text = self.get_text()
except ParseError:
return None
# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
try:
date = __parser(date_string, settings.DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
if date is not None and next_year > date.year > 1900:
break
else:
date = None
if date is not None:
self.log(
"info",
"Detected document date {} based on string {}".format(
date.isoformat(),
date_string
)
)
else:
self.log("info", "Unable to detect date for document")
return date
return self.date
def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir))

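Note: parse_date is now a module-level function that consults the filename first (when FILENAME_DATE_ORDER is set) and falls back to scanning the text, discarding anything outside 1900 to five years from now. A hedged usage sketch (requires configured Django settings; the inputs are illustrative):

from documents.parsers import parse_date

# The filename wins when FILENAME_DATE_ORDER is configured; otherwise
# the first plausible date found in the text is returned.
date = parse_date("2018-02-13 invoice.pdf", "Payment is due 13.02.2018.")
if date:
    print(date.isoformat())  # timezone-aware datetime
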
View File

@@ -47,7 +47,7 @@ def check_sanity():
present_files.append(os.path.normpath(os.path.join(root, f)))
for doc in Document.objects.all():
# Check thumbnail
# Check sanity of the thumbnail
if not os.path.isfile(doc.thumbnail_path):
messages.append(SanityError(
f"Thumbnail of document {doc.pk} does not exist."))
@@ -61,26 +61,49 @@ def check_sanity():
f"Cannot read thumbnail file of document {doc.pk}: {e}"
))
# Check document
# Check sanity of the original file
# TODO: extract method
if not os.path.isfile(doc.source_path):
messages.append(SanityError(
f"Original of document {doc.pk} does not exist."))
else:
present_files.remove(os.path.normpath(doc.source_path))
checksum = None
try:
with doc.source_file as f:
checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e:
messages.append(SanityError(
f"Cannot read original file of document {doc.pk}: {e}"))
else:
if not checksum == doc.checksum:
messages.append(SanityError(
f"Checksum mismatch of document {doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}."
))
if checksum and not checksum == doc.checksum:
# Check sanity of the archive file.
if doc.archive_checksum:
if not os.path.isfile(doc.archive_path):
messages.append(SanityError(
f"Checksum mismatch of document {doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}."
f"Archived version of document {doc.pk} does not exist."
))
else:
present_files.remove(os.path.normpath(doc.archive_path))
try:
with doc.archive_file as f:
checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e:
messages.append(SanityError(
f"Cannot read archive file of document {doc.pk}: {e}"
))
else:
if not checksum == doc.archive_checksum:
messages.append(SanityError(
f"Checksum mismatch of archive {doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}."
))
# other document checks
if not doc.content:
messages.append(SanityWarning(
f"Document {doc.pk} has no content."

View File

@@ -1,6 +1,9 @@
import magic
from pathvalidate import validate_filename, ValidationError
from rest_framework import serializers
from .models import Correspondent, Tag, Document, Log, DocumentType
from .parsers import is_mime_type_supported
class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
@@ -76,11 +79,9 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):
class DocumentSerializer(serializers.ModelSerializer):
correspondent_id = CorrespondentField(
allow_null=True, source='correspondent')
tags_id = TagsField(many=True, source='tags')
document_type_id = DocumentTypeField(
allow_null=True, source='document_type')
correspondent = CorrespondentField(allow_null=True)
tags = TagsField(many=True)
document_type = DocumentTypeField(allow_null=True)
class Meta:
model = Document
@@ -88,13 +89,10 @@ class DocumentSerializer(serializers.ModelSerializer):
fields = (
"id",
"correspondent",
"correspondent_id",
"document_type",
"document_type_id",
"title",
"content",
"tags",
"tags_id",
"created",
"modified",
"added",
@@ -113,3 +111,84 @@ class LogSerializer(serializers.ModelSerializer):
"group",
"level"
)
class PostDocumentSerializer(serializers.Serializer):
document = serializers.FileField(
label="Document",
write_only=True,
)
title = serializers.CharField(
label="Title",
write_only=True,
required=False,
)
correspondent = serializers.PrimaryKeyRelatedField(
queryset=Correspondent.objects.all(),
label="Correspondent",
allow_null=True,
write_only=True,
required=False,
)
document_type = serializers.PrimaryKeyRelatedField(
queryset=DocumentType.objects.all(),
label="Document type",
allow_null=True,
write_only=True,
required=False,
)
tags = serializers.PrimaryKeyRelatedField(
many=True,
queryset=Tag.objects.all(),
label="Tags",
write_only=True,
required=False,
)
def validate(self, attrs):
document = attrs.get('document')
try:
validate_filename(document.name)
except ValidationError:
raise serializers.ValidationError("Invalid filename.")
document_data = document.file.read()
mime_type = magic.from_buffer(document_data, mime=True)
if not is_mime_type_supported(mime_type):
raise serializers.ValidationError(
"This mime type is not supported.")
attrs['document_data'] = document_data
title = attrs.get('title')
if not title:
attrs['title'] = None
correspondent = attrs.get('correspondent')
if correspondent:
attrs['correspondent_id'] = correspondent.id
else:
attrs['correspondent_id'] = None
document_type = attrs.get('document_type')
if document_type:
attrs['document_type_id'] = document_type.id
else:
attrs['document_type_id'] = None
tags = attrs.get('tags')
if tags:
tag_ids = [tag.id for tag in tags]
attrs['tag_ids'] = tag_ids
else:
attrs['tag_ids'] = None
return attrs

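Note: this serializer replaces the old upload form, so the endpoint now accepts metadata overrides alongside the file. A hedged client-side sketch (host, credentials, and IDs are placeholders; the endpoint path is the one exercised by the tests):

import requests

with open("simple.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/api/documents/post_document/",
        data={"title": "my custom title", "correspondent": 1, "tags": [1, 2]},
        files={"document": f},
        auth=("user", "password"),
    )
response.raise_for_status()
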
View File

@@ -2,3 +2,4 @@
# for exporting/importing commands
EXPORTER_FILE_NAME = "__exported_file_name__"
EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__"
EXPORTER_ARCHIVE_NAME = "__exported_archive_name__"

View File

@@ -13,7 +13,7 @@ from rest_framework.reverse import reverse
from .. import index, matching
from ..file_handling import delete_empty_directories, generate_filename, \
create_source_path_directory
create_source_path_directory, archive_name_from_filename
from ..models import Document, Tag
@@ -169,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs):
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
for f in (instance.source_path, instance.thumbnail_path):
try:
os.unlink(f)
except FileNotFoundError:
pass # The file's already gone, so we're cool with it.
for f in (instance.source_path,
instance.archive_path,
instance.thumbnail_path):
if os.path.isfile(f):
try:
os.unlink(f)
logging.getLogger(__name__).debug(
f"Deleted file {f}.")
except OSError as e:
logging.getLogger(__name__).warning(
f"While deleting document {instance.file_name}, the file "
f"{f} could not be deleted: {e}"
)
delete_empty_directories(os.path.dirname(instance.source_path))
delete_empty_directories(
os.path.dirname(instance.source_path),
root=settings.ORIGINALS_DIR
)
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
)
def validate_move(instance, old_path, new_path):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logging.getLogger(__name__).fatal(
f"Document {str(instance)}: File {old_path} has gone.")
return False
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
logging.getLogger(__name__).warning(
f"Document {str(instance)}: Cannot rename file "
f"since target path {new_path} already exists.")
return False
return True
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@@ -183,55 +216,91 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
def update_filename_and_move_files(sender, instance, **kwargs):
if not instance.filename:
# Can't update the filename if there is not filename to begin with
# This happens after the consumer creates a new document.
# The PK needs to be set first by saving the document once. When this
# happens, the file is not yet in the ORIGINALS_DIR, and thus can't be
# renamed anyway. In all other cases, instance.filename will be set.
# Can't update the filename if there is no filename to begin with
# This happens when the consumer creates a new document.
# The document is modified and saved multiple times, and only after
# everything is done (i.e., the generated filename is final),
# filename will be set to the location where the consumer has put
# the file.
#
# This will in turn cause this logic to move the file where it belongs.
return
old_filename = instance.filename
old_path = instance.source_path
new_filename = generate_filename(instance)
if new_filename == instance.filename:
# Don't do anything if its the same.
return
new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
old_source_path = instance.source_path
new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logging.getLogger(__name__).fatal(
f"Document {str(instance)}: File {old_path} has gone.")
if not validate_move(instance, old_source_path, new_source_path):
return
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
logging.getLogger(__name__).warning(
f"Document {str(instance)}: Cannot rename file "
f"since target path {new_path} already exists.")
return
# Archive files are optional; the archive checksum tells us whether
# we have one, since it is None for documents without archives.
if instance.archive_checksum:
new_archive_filename = archive_name_from_filename(new_filename)
old_archive_path = instance.archive_path
new_archive_path = os.path.join(settings.ARCHIVE_DIR,
new_archive_filename)
create_source_path_directory(new_path)
if not validate_move(instance, old_archive_path, new_archive_path):
return
create_source_path_directory(new_archive_path)
else:
old_archive_path = None
new_archive_path = None
create_source_path_directory(new_source_path)
try:
os.rename(old_path, new_path)
os.rename(old_source_path, new_source_path)
if instance.archive_checksum:
os.rename(old_archive_path, new_archive_path)
instance.filename = new_filename
# Don't save here to prevent infinite recursion.
Document.objects.filter(pk=instance.pk).update(filename=new_filename)
logging.getLogger(__name__).debug(
f"Moved file {old_path} to {new_path}.")
f"Moved file {old_source_path} to {new_source_path}.")
if instance.archive_checksum:
logging.getLogger(__name__).debug(
f"Moved file {old_archive_path} to {new_archive_path}.")
except OSError as e:
instance.filename = old_filename
# this happens when we can't move a file. If that's the case for the
# archive file, we try our best to revert the changes.
try:
os.rename(new_source_path, old_source_path)
os.rename(new_archive_path, old_archive_path)
except Exception as e:
# This is fine, since:
# A: if we managed to move the source from A to B, we will also
#    manage to move it from B to A. If not, we have a serious issue
#    that will get caught by the sanity checker. All files remain
#    in place and are never overwritten, so this is not the end of
#    the world.
# B: if moving the original file failed, nothing has changed anyway.
pass
except DatabaseError as e:
os.rename(new_path, old_path)
os.rename(new_source_path, old_source_path)
if instance.archive_checksum:
os.rename(new_archive_path, old_archive_path)
instance.filename = old_filename
if not os.path.isfile(old_path):
delete_empty_directories(os.path.dirname(old_path))
if not os.path.isfile(old_source_path):
delete_empty_directories(os.path.dirname(old_source_path),
root=settings.ORIGINALS_DIR)
if old_archive_path and not os.path.isfile(old_archive_path):
delete_empty_directories(os.path.dirname(old_archive_path),
root=settings.ARCHIVE_DIR)
def set_log_entry(sender, document=None, logging_group=None, **kwargs):

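Note: the renaming handler now moves the original and the archive file as a pair and makes a best-effort attempt to undo a half-completed move. The underlying pattern, as a minimal standalone sketch:

import os


def rename_pair(src_a, dst_a, src_b, dst_b):
    # Move two files as a unit: if the second rename fails, undo the
    # first so the pair is never left half-moved (best effort, as in
    # update_filename_and_move_files above).
    os.rename(src_a, dst_a)
    try:
        os.rename(src_b, dst_b)
    except OSError:
        os.rename(dst_a, src_a)
        raise
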
View File

@@ -13,8 +13,8 @@ from documents.sanity_checker import SanityFailedError
def index_optimize():
ix = index.open_index()
with AsyncWriter(ix) as writer:
writer.commit(optimize=True)
writer = AsyncWriter(ix)
writer.commit(optimize=True)
def index_reindex():

View File

(binary image diff omitted; 32 KiB before and after)

View File

@@ -12,10 +12,10 @@ from documents.models import Document, Correspondent, DocumentType, Tag
from documents.tests.utils import DirectoriesMixin
class DocumentApiTest(DirectoriesMixin, APITestCase):
class TestDocumentApi(DirectoriesMixin, APITestCase):
def setUp(self):
super(DocumentApiTest, self).setUp()
super(TestDocumentApi, self).setUp()
user = User.objects.create_superuser(username="temp_admin")
self.client.force_login(user=user)
@@ -41,20 +41,13 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
returned_doc = response.data['results'][0]
self.assertEqual(returned_doc['id'], doc.id)
self.assertEqual(returned_doc['title'], doc.title)
self.assertEqual(returned_doc['correspondent']['name'], c.name)
self.assertEqual(returned_doc['document_type']['name'], dt.name)
self.assertEqual(returned_doc['correspondent']['id'], c.id)
self.assertEqual(returned_doc['document_type']['id'], dt.id)
self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id'])
self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id'])
self.assertEqual(len(returned_doc['tags']), 1)
self.assertEqual(returned_doc['tags'][0]['name'], tag.name)
self.assertEqual(returned_doc['tags'][0]['id'], tag.id)
self.assertListEqual(returned_doc['tags_id'], [tag.id])
self.assertEqual(returned_doc['correspondent'], c.id)
self.assertEqual(returned_doc['document_type'], dt.id)
self.assertListEqual(returned_doc['tags'], [tag.id])
c2 = Correspondent.objects.create(name="c2")
returned_doc['correspondent_id'] = c2.pk
returned_doc['correspondent'] = c2.pk
returned_doc['title'] = "the new title"
response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json')
@@ -100,6 +93,44 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content_thumbnail)
def test_download_with_archive(self):
_, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)
content = b"This is a test"
content_archive = b"This is the same test but archived"
with open(filename, "wb") as f:
f.write(content)
filename = os.path.basename(filename)
doc = Document.objects.create(title="none", filename=filename,
mime_type="application/pdf")
with open(doc.archive_path, "wb") as f:
f.write(content_archive)
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content_archive)
response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk))
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content)
response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content_archive)
response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk))
self.assertEqual(response.status_code, 200)
self.assertEqual(response.content, content)
def test_document_actions_not_existing_file(self):
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
@@ -289,6 +320,22 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 10)
def test_search_spelling_correction(self):
with AsyncWriter(index.open_index()) as writer:
for i in range(55):
doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}")
index.update_document(writer, doc)
response = self.client.get("/api/search/?query=thing")
correction = response.data['corrected_query']
self.assertEqual(correction, "things")
response = self.client.get("/api/search/?query=things")
correction = response.data['corrected_query']
self.assertEqual(correction, None)
def test_statistics(self):
doc1 = Document.objects.create(title="none1", checksum="A")
@@ -304,7 +351,7 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.data['documents_total'], 3)
self.assertEqual(response.data['documents_inbox'], 1)
@mock.patch("documents.forms.async_task")
@mock.patch("documents.views.async_task")
def test_upload(self, m):
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
@@ -316,8 +363,12 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
args, kwargs = m.call_args
self.assertEqual(kwargs['override_filename'], "simple.pdf")
self.assertIsNone(kwargs['override_title'])
self.assertIsNone(kwargs['override_correspondent_id'])
self.assertIsNone(kwargs['override_document_type_id'])
self.assertIsNone(kwargs['override_tag_ids'])
@mock.patch("documents.forms.async_task")
@mock.patch("documents.views.async_task")
def test_upload_invalid_form(self, m):
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
@@ -325,7 +376,7 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
m.assert_not_called()
@mock.patch("documents.forms.async_task")
@mock.patch("documents.views.async_task")
def test_upload_invalid_file(self, m):
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.zip"), "rb") as f:
@@ -333,8 +384,8 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
m.assert_not_called()
@mock.patch("documents.forms.async_task")
@mock.patch("documents.forms.validate_filename")
@mock.patch("documents.views.async_task")
@mock.patch("documents.serialisers.validate_filename")
def test_upload_invalid_filename(self, validate_filename, async_task):
validate_filename.side_effect = ValidationError()
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
@@ -342,3 +393,85 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
def test_upload_with_title(self, async_task):
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
response = self.client.post("/api/documents/post_document/", {"document": f, "title": "my custom title"})
self.assertEqual(response.status_code, 200)
async_task.assert_called_once()
args, kwargs = async_task.call_args
self.assertEqual(kwargs['override_title'], "my custom title")
@mock.patch("documents.views.async_task")
def test_upload_with_correspondent(self, async_task):
c = Correspondent.objects.create(name="test-corres")
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": c.id})
self.assertEqual(response.status_code, 200)
async_task.assert_called_once()
args, kwargs = async_task.call_args
self.assertEqual(kwargs['override_correspondent_id'], c.id)
@mock.patch("documents.views.async_task")
def test_upload_with_invalid_correspondent(self, async_task):
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": 3456})
self.assertEqual(response.status_code, 400)
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
def test_upload_with_document_type(self, async_task):
dt = DocumentType.objects.create(name="invoice")
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": dt.id})
self.assertEqual(response.status_code, 200)
async_task.assert_called_once()
args, kwargs = async_task.call_args
self.assertEqual(kwargs['override_document_type_id'], dt.id)
@mock.patch("documents.views.async_task")
def test_upload_with_invalid_document_type(self, async_task):
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": 34578})
self.assertEqual(response.status_code, 400)
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
def test_upload_with_tags(self, async_task):
t1 = Tag.objects.create(name="tag1")
t2 = Tag.objects.create(name="tag2")
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
response = self.client.post(
"/api/documents/post_document/",
{"document": f, "tags": [t2.id, t1.id]})
self.assertEqual(response.status_code, 200)
async_task.assert_called_once()
args, kwargs = async_task.call_args
self.assertCountEqual(kwargs['override_tag_ids'], [t1.id, t2.id])
@mock.patch("documents.views.async_task")
def test_upload_with_invalid_tags(self, async_task):
t1 = Tag.objects.create(name="tag1")
t2 = Tag.objects.create(name="tag2")
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
response = self.client.post(
"/api/documents/post_document/",
{"document": f, "tags": [t2.id, t1.id, 734563]})
self.assertEqual(response.status_code, 400)
async_task.assert_not_called()

View File

@@ -1,5 +1,6 @@
import os
import re
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock
@@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase):
class DummyParser(DocumentParser):
def get_thumbnail(self):
def get_thumbnail(self, document_path, mime_type):
# not important during tests
raise NotImplementedError()
def __init__(self, path, logging_group, scratch_dir):
super(DummyParser, self).__init__(path, logging_group)
def __init__(self, logging_group, scratch_dir, archive_path):
super(DummyParser, self).__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
self.archive_path = archive_path
def get_optimised_thumbnail(self):
def get_optimised_thumbnail(self, document_path, mime_type):
return self.fake_thumb
def get_text(self):
return "The Text"
def parse(self, document_path, mime_type):
self.text = "The Text"
class FaultyParser(DocumentParser):
def get_thumbnail(self):
def get_thumbnail(self, document_path, mime_type):
# not important during tests
raise NotImplementedError()
def __init__(self, path, logging_group, scratch_dir):
super(FaultyParser, self).__init__(path, logging_group)
def __init__(self, logging_group, scratch_dir):
super(FaultyParser, self).__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
def get_optimised_thumbnail(self):
def get_optimised_thumbnail(self, document_path, mime_type):
return self.fake_thumb
def get_text(self):
def parse(self, document_path, mime_type):
raise ParseError("Does not compute.")
@@ -410,11 +412,11 @@ def fake_magic_from_file(file, mime=False):
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(DirectoriesMixin, TestCase):
def make_dummy_parser(self, path, logging_group):
return DummyParser(path, logging_group, self.dirs.scratch_dir)
def make_dummy_parser(self, logging_group):
return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())
def make_faulty_parser(self, path, logging_group):
return FaultyParser(path, logging_group, self.dirs.scratch_dir)
def make_faulty_parser(self, logging_group):
return FaultyParser(logging_group, self.dirs.scratch_dir)
def setUp(self):
super(TestConsumer, self).setUp()
@@ -423,7 +425,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
m = patcher.start()
m.return_value = [(None, {
"parser": self.make_dummy_parser,
"mime_types": ["application/pdf"],
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]
@@ -432,9 +434,18 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.consumer = Consumer()
def get_test_file(self):
fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir)
return f
src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf")
dst = os.path.join(self.dirs.scratch_dir, "sample.pdf")
shutil.copy(src, dst)
return dst
def get_test_archive_file(self):
src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf")
dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf")
shutil.copy(src, dst)
return dst
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
def testNormalOperation(self):
filename = self.get_test_file()
@@ -454,6 +465,13 @@ class TestConsumer(DirectoriesMixin, TestCase):
document.thumbnail_path
))
self.assertTrue(os.path.isfile(
document.archive_path
))
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
self.assertFalse(os.path.isfile(filename))
def testOverrideFilename(self):
@@ -501,7 +519,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.fail("Should throw exception")
def testDuplicates(self):
def testDuplicates1(self):
self.consumer.try_consume_file(self.get_test_file())
try:
@@ -512,6 +530,21 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.fail("Should throw exception")
def testDuplicates2(self):
self.consumer.try_consume_file(self.get_test_file())
try:
self.consumer.try_consume_file(self.get_test_archive_file())
except ConsumerError as e:
self.assertTrue(str(e).endswith("It is a duplicate."))
return
self.fail("Should throw exception")
def testDuplicates3(self):
self.consumer.try_consume_file(self.get_test_archive_file())
self.consumer.try_consume_file(self.get_test_file())
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testNoParsers(self, m):
m.return_value = []
@@ -519,7 +552,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
try:
self.consumer.try_consume_file(self.get_test_file())
except ConsumerError as e:
self.assertTrue(str(e).startswith("No parsers abvailable"))
self.assertTrue("No parsers abvailable for" in str(e))
return
self.fail("Should throw exception")
@@ -528,7 +561,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testFaultyParser(self, m):
m.return_value = [(None, {
"parser": self.make_faulty_parser,
"mime_types": ["application/pdf"],
"mime_types": {"application/pdf": ".pdf"},
"weight": 0
})]

View File

@@ -0,0 +1,140 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4
from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings
from documents.parsers import parse_date
from paperless_tesseract.parsers import RasterisedDocumentParser
class TestDate(TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples")
SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
def setUp(self):
os.makedirs(self.SCRATCH, exist_ok=True)
def tearDown(self):
shutil.rmtree(self.SCRATCH)
def test_date_format_1(self):
text = "lorem ipsum 130218 lorem ipsum"
self.assertEqual(parse_date("", text), None)
def test_date_format_2(self):
text = "lorem ipsum 2018 lorem ipsum"
self.assertEqual(parse_date("", text), None)
def test_date_format_3(self):
text = "lorem ipsum 20180213 lorem ipsum"
self.assertEqual(parse_date("", text), None)
def test_date_format_4(self):
text = "lorem ipsum 13.02.2018 lorem ipsum"
date = parse_date("", text)
self.assertEqual(
date,
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
def test_date_format_5(self):
text = (
"lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
"ipsum"
)
date = parse_date("", text)
self.assertEqual(
date,
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
def test_date_format_6(self):
text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum"
)
self.assertEqual(parse_date("", text), None)
def test_date_format_7(self):
text = (
"lorem ipsum\n"
"März 2019\n"
"lorem ipsum"
)
date = parse_date("", text)
self.assertEqual(
date,
datetime.datetime(
2019, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
def test_date_format_8(self):
text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum\n"
"März 2020"
)
self.assertEqual(
parse_date("", text),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_9(self):
text = (
"lorem ipsum\n"
"27. Nullmonth 2020\n"
"März 2020\n"
"lorem ipsum"
)
self.assertEqual(
parse_date("", text),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
def test_crazy_date_past(self, *args):
self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))
def test_crazy_date_future(self, *args):
self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))
def test_crazy_date_with_spaces(self, *args):
self.assertIsNone(parse_date("", "20 408000l 2475"))
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_invalid(self, *args):
self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))

View File

@@ -1,12 +1,29 @@
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import TestCase, override_settings
from ..models import Document, Correspondent
class TestDocument(TestCase):
def setUp(self) -> None:
self.originals_dir = tempfile.mkdtemp()
self.thumb_dir = tempfile.mkdtemp()
override_settings(
ORIGINALS_DIR=self.originals_dir,
THUMBNAIL_DIR=self.thumb_dir,
).enable()
def tearDown(self) -> None:
shutil.rmtree(self.originals_dir)
shutil.rmtree(self.thumb_dir)
def test_file_deletion(self):
document = Document.objects.create(
correspondent=Correspondent.objects.create(name="Test0"),
@@ -19,8 +36,31 @@ class TestDocument(TestCase):
file_path = document.source_path
thumb_path = document.thumbnail_path
Path(file_path).touch()
Path(thumb_path).touch()
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete()
mock_unlink.assert_any_call(file_path)
mock_unlink.assert_any_call(thumb_path)
self.assertEqual(mock_unlink.call_count, 2)
def test_file_name(self):
doc = Document(mime_type="application/pdf", title="test", created=datetime(2020, 12, 25))
self.assertEqual(doc.file_name, "20201225-test.pdf")
def test_file_name_jpg(self):
doc = Document(mime_type="image/jpeg", title="test", created=datetime(2020, 12, 25))
self.assertEqual(doc.file_name, "20201225-test.jpg")
def test_file_name_unknown(self):
doc = Document(mime_type="application/zip", title="test", created=datetime(2020, 12, 25))
self.assertEqual(doc.file_name, "20201225-test.zip")
def test_file_name_invalid(self):
doc = Document(mime_type="image/jpegasd", title="test", created=datetime(2020, 12, 25))
self.assertEqual(doc.file_name, "20201225-test")

View File

@@ -2,32 +2,17 @@ import os
import shutil
from pathlib import Path
from unittest import mock
from uuid import uuid4
from django.conf import settings
from django.db import DatabaseError
from django.test import TestCase, override_settings
from .utils import DirectoriesMixin
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
from ..models import Document, Correspondent
class TestDate(TestCase):
deletion_list = []
def add_to_deletion_list(self, dirname):
self.deletion_list.append(dirname)
def setUp(self):
folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
os.makedirs(folder + "/documents/originals")
override_settings(MEDIA_ROOT=folder).enable()
override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
self.add_to_deletion_list(folder)
def tearDown(self):
for dirname in self.deletion_list:
shutil.rmtree(dirname, ignore_errors=True)
class TestFileHandling(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self):
@@ -104,7 +89,7 @@ class TestDate(TestCase):
document.save()
# Check proper handling of files
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
@@ -140,7 +125,7 @@ class TestDate(TestCase):
# Check proper handling of files
self.assertTrue(os.path.isfile(document.source_path))
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
@@ -196,8 +181,8 @@ class TestDate(TestCase):
document.save()
# Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
self.assertTrue(os.path.isfile(important_file))
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
@@ -315,13 +300,12 @@ class TestDate(TestCase):
# Create our working directory
tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty")
os.makedirs(tmp)
self.add_to_deletion_list(tmp)
os.makedirs(os.path.join(tmp, "notempty"))
Path(os.path.join(tmp, "notempty", "file")).touch()
os.makedirs(os.path.join(tmp, "notempty", "empty"))
delete_empty_directories(os.path.join(tmp, "notempty", "empty"))
delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR)
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
self.assertEqual(os.path.isfile(
os.path.join(tmp, "notempty", "file")), True)
@@ -345,3 +329,159 @@ class TestDate(TestCase):
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")
class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
def test_create_no_format(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def test_create_with_format(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertFalse(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf"))
self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf"))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def test_move_archive_gone(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
#Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertTrue(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertFalse(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def test_move_archive_exists(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.os.rename")
def test_move_archive_error(self, m):
def fake_rename(src, dst):
if "archive" in src:
raise OSError()
else:
os.remove(src)
Path(dst).touch()
m.side_effect = fake_rename
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def test_move_file_gone(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
#Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertFalse(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertFalse(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.os.rename")
def test_move_file_error(self, m):
def fake_rename(src, dst):
if "original" in src:
raise OSError()
else:
os.remove(src)
Path(dst).touch()
m.side_effect = fake_rename
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
def test_archive_deleted(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))
doc.delete()
self.assertFalse(os.path.isfile(original))
self.assertFalse(os.path.isfile(archive))
self.assertFalse(os.path.isfile(doc.source_path))
self.assertFalse(os.path.isfile(doc.archive_path))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
def test_database_error(self):
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
Path(original).touch()
Path(archive).touch()
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
m.side_effect = DatabaseError()
doc.save()
self.assertTrue(os.path.isfile(original))
self.assertTrue(os.path.isfile(archive))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(os.path.isfile(doc.archive_path))

View File

@@ -0,0 +1,42 @@
import filecmp
import os
import shutil
from django.core.management import call_command
from django.test import TestCase
from documents.management.commands.document_archiver import handle_document
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
class TestArchiver(DirectoriesMixin, TestCase):
def make_models(self):
self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf")
#self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
#self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")
def test_archiver(self):
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
self.make_models()
call_command('document_archiver')
def test_handle_document(self):
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
self.make_models()
handle_document(self.d1.pk)
doc = Document.objects.get(id=self.d1.id)
self.assertIsNotNone(doc.checksum)
self.assertTrue(os.path.isfile(doc.archive_path))
self.assertTrue(os.path.isfile(doc.source_path))
self.assertTrue(filecmp.cmp(sample_file, doc.source_path))

View File

@@ -7,8 +7,9 @@ from unittest import mock
from django.conf import settings
from django.core.management import call_command, CommandError
from django.test import override_settings, TestCase
from django.test import override_settings, TransactionTestCase
from documents.models import Tag
from documents.consumer import ConsumerError
from documents.management.commands import document_consumer
from documents.tests.utils import DirectoriesMixin
@@ -33,12 +34,12 @@ def chunked(size, source):
yield source[i:i+size]
class TestConsumer(DirectoriesMixin, TestCase):
class ConsumerMixin:
sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
def setUp(self) -> None:
super(TestConsumer, self).setUp()
super(ConsumerMixin, self).setUp()
self.t = None
patcher = mock.patch("documents.management.commands.document_consumer.async_task")
self.task_mock = patcher.start()
@@ -57,7 +58,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
# wait for the consumer to exit.
self.t.join()
super(TestConsumer, self).tearDown()
super(ConsumerMixin, self).tearDown()
def wait_for_task_mock_call(self):
n = 0
@@ -68,7 +69,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
return
n += 1
sleep(0.1)
self.fail("async_task was never called")
# A bogus async_task that will simply check the file for
# completeness and raise an exception otherwise.
@@ -95,6 +95,9 @@ class TestConsumer(DirectoriesMixin, TestCase):
sleep(0.1)
print("file completed.")
class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
def test_consume_file(self):
self.t_start()
@@ -108,9 +111,15 @@ class TestConsumer(DirectoriesMixin, TestCase):
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
@override_settings(CONSUMER_POLLING=1)
def test_consume_file_polling(self):
self.test_consume_file()
def test_consume_file_invalid_ext(self):
self.t_start()
f = os.path.join(self.dirs.consumption_dir, "my_file.wow")
shutil.copy(self.sample_file, f)
self.wait_for_task_mock_call()
self.task_mock.assert_not_called()
def test_consume_existing_file(self):
f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
@@ -122,10 +131,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
@override_settings(CONSUMER_POLLING=1)
def test_consume_existing_file_polling(self):
self.test_consume_existing_file()
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_pdf(self, error_logger):
@@ -146,10 +151,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], fname)
@override_settings(CONSUMER_POLLING=1)
def test_slow_write_pdf_polling(self):
self.test_slow_write_pdf()
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_and_move(self, error_logger):
@@ -172,10 +173,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
error_logger.assert_not_called()
@override_settings(CONSUMER_POLLING=1)
def test_slow_write_and_move_polling(self):
self.test_slow_write_and_move()
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_incomplete(self, error_logger):
@@ -195,10 +192,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
# assert that we have an error logged with this invalid file.
error_logger.assert_called_once()
@override_settings(CONSUMER_POLLING=1)
def test_slow_write_incomplete_polling(self):
self.test_slow_write_incomplete()
@override_settings(CONSUMPTION_DIR="does_not_exist")
def test_consumption_directory_invalid(self):
@@ -208,3 +201,62 @@ class TestConsumer(DirectoriesMixin, TestCase):
def test_consumption_directory_unset(self):
self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
@override_settings(CONSUMER_POLLING=1)
class TestConsumerPolling(TestConsumer):
# just do all the tests with polling
pass
@override_settings(CONSUMER_RECURSIVE=True)
class TestConsumerRecursive(TestConsumer):
# just do all the tests with recursive
pass
@override_settings(CONSUMER_RECURSIVE=True)
@override_settings(CONSUMER_POLLING=1)
class TestConsumerRecursivePolling(TestConsumer):
# just do all the tests with polling and recursive
pass
class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
@override_settings(CONSUMER_RECURSIVE=True)
@override_settings(CONSUMER_SUBDIRS_AS_TAGS=True)
def test_consume_file_with_path_tags(self):
tag_names = ("existingTag", "Space Tag")
# Create a Tag prior to consuming a file using it in path
tag_ids = [Tag.objects.create(name=tag_names[0]).pk,]
self.t_start()
path = os.path.join(self.dirs.consumption_dir, *tag_names)
os.makedirs(path, exist_ok=True)
f = os.path.join(path, "my_file.pdf")
# Wait at least inotify read_delay for recursive watchers
# to be created for the new directories
sleep(1)
shutil.copy(self.sample_file, f)
self.wait_for_task_mock_call()
self.task_mock.assert_called_once()
# Add the pk of the Tag created by _consume()
tag_ids.append(Tag.objects.get(name=tag_names[1]).pk)
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
# assertCountEqual has a misleading name: it asserts that the first
# sequence contains the same elements as the second, regardless of
# their order.
self.assertCountEqual(kwargs["override_tag_ids"], tag_ids)
@override_settings(CONSUMER_POLLING=1)
def test_consume_file_with_path_tags_polling(self):
self.test_consume_file_with_path_tags()
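The two tag tests above exercise CONSUMER_SUBDIRS_AS_TAGS: with recursive consumption enabled, every directory component between the consumption root and the file becomes a tag, created on demand if missing. A rough sketch of that mapping — not the actual consumer code, paths are hypothetical:
import os
consumption_dir = "/consume"                         # hypothetical root
path = "/consume/existingTag/Space Tag/my_file.pdf"  # hypothetical file
# each subdirectory below the root becomes one tag name
tag_names = os.path.relpath(os.path.dirname(path), consumption_dir).split(os.sep)
print(tag_names)  # ['existingTag', 'Space Tag']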

View File

@@ -17,7 +17,8 @@ class TestDecryptDocuments(TestCase):
@override_settings(
ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
PASSPHRASE="test"
PASSPHRASE="test",
PAPERLESS_FILENAME_FORMAT=None
)
@mock.patch("documents.management.commands.decrypt_documents.input")
def test_decrypt(self, m):

View File

@@ -9,10 +9,11 @@ from django.test import TestCase, override_settings
from documents.management.commands import document_exporter
from documents.models import Document, Tag, DocumentType, Correspondent
from documents.tests.utils import DirectoriesMixin
from documents.sanity_checker import check_sanity
from documents.tests.utils import DirectoriesMixin, paperless_environment
class TestExporter(DirectoriesMixin, TestCase):
class TestExportImport(DirectoriesMixin, TestCase):
@override_settings(
PASSPHRASE="test"
@@ -23,11 +24,8 @@ class TestExporter(DirectoriesMixin, TestCase):
file = os.path.join(self.dirs.originals_dir, "0000001.pdf")
with open(file, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
Tag.objects.create(name="t")
DocumentType.objects.create(name="dt")
Correspondent.objects.create(name="c")
@@ -51,6 +49,23 @@ class TestExporter(DirectoriesMixin, TestCase):
checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(checksum, element['fields']['checksum'])
Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")
if document_exporter.EXPORTER_ARCHIVE_NAME in element:
fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
self.assertTrue(os.path.exists(fname))
with open(fname, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(checksum, element['fields']['archive_checksum'])
with paperless_environment() as dirs:
call_command('document_importer', target)
messages = check_sanity()
# everything is alright after the test
self.assertEqual(len(messages), 0, str([str(m) for m in messages]))
def test_export_missing_files(self):
target = tempfile.mkdtemp()
call_command('document_exporter', target)
Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")
self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)

View File

@@ -1,10 +1,15 @@
import os
import shutil
import tempfile
from tempfile import TemporaryDirectory
from unittest import mock
from django.test import TestCase
from django.test import TestCase, override_settings
from documents.parsers import get_parser_class
from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \
get_parser_class_for_mime_type, DocumentParser, is_file_ext_supported
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser
def fake_magic_from_file(file, mime=False):
@@ -27,7 +32,7 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
(None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}),
)
self.assertEqual(
@@ -45,8 +50,8 @@ class TestParserDiscovery(TestCase):
pass
m.return_value = (
(None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
(None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
(None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}),
(None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}),
)
self.assertEqual(
@@ -61,3 +66,57 @@ class TestParserDiscovery(TestCase):
self.assertIsNone(
get_parser_class("doc.pdf")
)
def fake_get_thumbnail(self, path, mimetype):
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
class TestBaseParser(TestCase):
def setUp(self) -> None:
self.scratch = tempfile.mkdtemp()
override_settings(
SCRATCH_DIR=self.scratch
).enable()
def tearDown(self) -> None:
shutil.rmtree(self.scratch)
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=True)
def test_get_optimised_thumbnail(self):
parser = DocumentParser(None)
parser.get_optimised_thumbnail("any", "not important")
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
@override_settings(OPTIMIZE_THUMBNAILS=False)
def test_get_optimised_thumb_disabled(self):
parser = DocumentParser(None)
path = parser.get_optimised_thumbnail("any", "not important")
self.assertEqual(path, fake_get_thumbnail(None, None, None))
class TestParserAvailability(TestCase):
def test_file_extensions(self):
for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
self.assertIn(ext, get_supported_file_extensions())
self.assertEqual(get_default_file_extension('application/pdf'), ".pdf")
self.assertEqual(get_default_file_extension('image/png'), ".png")
self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg")
self.assertEqual(get_default_file_extension('text/plain'), ".txt")
self.assertEqual(get_default_file_extension('text/csv'), ".csv")
self.assertEqual(get_default_file_extension('application/zip'), ".zip")
self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "")
self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser)
self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)
self.assertTrue(is_file_ext_supported('.pdf'))
self.assertFalse(is_file_ext_supported('.hsdfh'))
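These assertions reflect the changed declaration format visible throughout this diff: mime_types is now a dict mapping each supported mime type to its default file extension, instead of a plain list. A sketch of a conforming document_consumer_declaration handler; DummyParser is a placeholder, not real project code:
class DummyParser:
    # placeholder stand-in for a real DocumentParser subclass
    pass

def document_consumer_declaration(sender, **kwargs):
    return {
        "parser": DummyParser,
        "weight": 0,
        # mime type -> default file extension (previously a plain list)
        "mime_types": {"application/pdf": ".pdf"},
    }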

View File

@@ -32,7 +32,7 @@ class PostConsumeTestCase(TestCase):
@mock.patch("documents.signals.handlers.Popen")
@override_settings(POST_CONSUME_SCRIPT="script")
def test_post_consume_script_simple(self, m):
def test_post_consume_script_with_correspondent(self, m):
c = Correspondent.objects.create(name="my_bank")
doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
tag1 = Tag.objects.create(name="a")
@@ -53,5 +53,4 @@ class PostConsumeTestCase(TestCase):
self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
self.assertEqual(command[7], "my_bank")
# TODO: tags are unordered by default.
self.assertEqual(command[8], "a,b")
self.assertCountEqual(command[8].split(","), ["a", "b"])

View File

@@ -0,0 +1,87 @@
import os
import shutil
from pathlib import Path
from django.test import TestCase
from documents.models import Document
from documents.sanity_checker import check_sanity, SanityFailedError
from documents.tests.utils import DirectoriesMixin
class TestSanityCheck(DirectoriesMixin, TestCase):
def make_test_data(self):
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf"), os.path.join(self.dirs.originals_dir, "0000001.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf"))
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png"))
return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf")
def test_no_docs(self):
self.assertEqual(len(check_sanity()), 0)
def test_success(self):
self.make_test_data()
self.assertEqual(len(check_sanity()), 0)
def test_no_thumbnail(self):
doc = self.make_test_data()
os.remove(doc.thumbnail_path)
self.assertEqual(len(check_sanity()), 1)
def test_thumbnail_no_access(self):
doc = self.make_test_data()
os.chmod(doc.thumbnail_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
os.chmod(doc.thumbnail_path, 0o777)
def test_no_original(self):
doc = self.make_test_data()
os.remove(doc.source_path)
self.assertEqual(len(check_sanity()), 1)
def test_original_no_access(self):
doc = self.make_test_data()
os.chmod(doc.source_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
os.chmod(doc.source_path, 0o777)
def test_original_checksum_mismatch(self):
doc = self.make_test_data()
doc.checksum = "WOW"
doc.save()
self.assertEqual(len(check_sanity()), 1)
def test_no_archive(self):
doc = self.make_test_data()
os.remove(doc.archive_path)
self.assertEqual(len(check_sanity()), 1)
def test_archive_no_access(self):
doc = self.make_test_data()
os.chmod(doc.archive_path, 0o000)
self.assertEqual(len(check_sanity()), 1)
os.chmod(doc.archive_path, 0o777)
def test_archive_checksum_mismatch(self):
doc = self.make_test_data()
doc.archive_checksum = "WOW"
doc.save()
self.assertEqual(len(check_sanity()), 1)
def test_empty_content(self):
doc = self.make_test_data()
doc.content = ""
doc.save()
self.assertEqual(len(check_sanity()), 1)
def test_orphaned_file(self):
doc = self.make_test_data()
Path(self.dirs.originals_dir, "orphaned").touch()
self.assertEqual(len(check_sanity()), 1)
def test_all(self):
Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf")
string = str(SanityFailedError(check_sanity()))

View File

@@ -0,0 +1,24 @@
from datetime import datetime
from django.test import TestCase
from django.utils import timezone
from documents import tasks
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
class TestTasks(DirectoriesMixin, TestCase):
def test_index_reindex(self):
Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now())
tasks.index_reindex()
def test_index_optimize(self):
Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now())
tasks.index_optimize()
def test_train_classifier(self):
tasks.train_classifier()

View File

@@ -2,6 +2,7 @@ import os
import shutil
import tempfile
from collections import namedtuple
from contextlib import contextmanager
from django.test import override_settings
@@ -17,22 +18,26 @@ def setup_directories():
dirs.index_dir = os.path.join(dirs.data_dir, "index")
dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")
dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive")
os.makedirs(dirs.index_dir, exist_ok=True)
os.makedirs(dirs.originals_dir, exist_ok=True)
os.makedirs(dirs.thumbnail_dir, exist_ok=True)
os.makedirs(dirs.archive_dir, exist_ok=True)
override_settings(
dirs.settings_override = override_settings(
DATA_DIR=dirs.data_dir,
SCRATCH_DIR=dirs.scratch_dir,
MEDIA_ROOT=dirs.media_dir,
ORIGINALS_DIR=dirs.originals_dir,
THUMBNAIL_DIR=dirs.thumbnail_dir,
ARCHIVE_DIR=dirs.archive_dir,
CONSUMPTION_DIR=dirs.consumption_dir,
INDEX_DIR=dirs.index_dir,
MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")
).enable()
)
dirs.settings_override.enable()
return dirs
@@ -42,6 +47,18 @@ def remove_dirs(dirs):
shutil.rmtree(dirs.data_dir, ignore_errors=True)
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
dirs.settings_override.disable()
@contextmanager
def paperless_environment():
dirs = None
try:
dirs = setup_directories()
yield dirs
finally:
if dirs:
remove_dirs(dirs)
class DirectoriesMixin:

View File

@@ -1,8 +1,16 @@
import os
import tempfile
from datetime import datetime
from time import mktime
from django.conf import settings
from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest, Http404
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from rest_framework import parsers
from rest_framework.decorators import action
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.mixins import (
@@ -31,14 +39,14 @@ from .filters import (
DocumentTypeFilterSet,
LogFilterSet
)
from .forms import UploadForm
from .models import Correspondent, Document, Log, Tag, DocumentType
from .serialisers import (
CorrespondentSerializer,
DocumentSerializer,
LogSerializer,
TagSerializer,
DocumentTypeSerializer
DocumentTypeSerializer,
PostDocumentSerializer
)
@@ -131,29 +139,32 @@ class DocumentViewSet(RetrieveModelMixin,
index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
def file_response(self, pk, disposition):
@staticmethod
def original_requested(request):
return (
'original' in request.query_params and
request.query_params['original'] == 'true'
)
def file_response(self, pk, request, disposition):
doc = Document.objects.get(id=pk)
if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501
file_handle = doc.archive_file
filename = doc.archive_file_name
mime_type = 'application/pdf'
else:
file_handle = doc.source_file
else:
file_handle = GnuPG.decrypted(doc.source_file)
filename = doc.file_name
mime_type = doc.mime_type
response = HttpResponse(file_handle, content_type=doc.mime_type)
if doc.storage_type == Document.STORAGE_TYPE_GPG:
file_handle = GnuPG.decrypted(file_handle)
response = HttpResponse(file_handle, content_type=mime_type)
response["Content-Disposition"] = '{}; filename="{}"'.format(
disposition, doc.file_name)
disposition, filename)
return response
@action(methods=['post'], detail=False)
def post_document(self, request, pk=None):
# TODO: is this a good implementation?
form = UploadForm(data=request.POST, files=request.FILES)
if form.is_valid():
form.save()
return Response("OK")
else:
return HttpResponseBadRequest(str(form.errors))
@action(methods=['post'], detail=False)
def bulk_edit(self, request, pk=None):
try:
@@ -169,6 +180,8 @@ class DocumentViewSet(RetrieveModelMixin,
"paperless__checksum": doc.checksum,
"paperless__mime_type": doc.mime_type,
"paperless__filename": doc.filename,
"paperless__has_archive_version":
os.path.isfile(doc.archive_path)
})
except Document.DoesNotExist:
raise Http404()
@@ -176,7 +189,8 @@ class DocumentViewSet(RetrieveModelMixin,
@action(methods=['get'], detail=True)
def preview(self, request, pk=None):
try:
response = self.file_response(pk, "inline")
response = self.file_response(
pk, request, "inline")
return response
except (FileNotFoundError, Document.DoesNotExist):
raise Http404()
@@ -193,7 +207,8 @@ class DocumentViewSet(RetrieveModelMixin,
@action(methods=['get'], detail=True)
def download(self, request, pk=None):
try:
return self.file_response(pk, "attachment")
return self.file_response(
pk, request, "attachment")
except (FileNotFoundError, Document.DoesNotExist):
raise Http404()
@@ -210,6 +225,56 @@ class LogViewSet(ReadOnlyModelViewSet):
ordering_fields = ("created",)
class PostDocumentView(APIView):
permission_classes = (IsAuthenticated,)
serializer_class = PostDocumentSerializer
parser_classes = (parsers.MultiPartParser,)
def get_serializer_context(self):
return {
'request': self.request,
'format': self.format_kwarg,
'view': self
}
def get_serializer(self, *args, **kwargs):
kwargs['context'] = self.get_serializer_context()
return self.serializer_class(*args, **kwargs)
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
document = serializer.validated_data['document']
document_data = serializer.validated_data['document_data']
correspondent_id = serializer.validated_data['correspondent_id']
document_type_id = serializer.validated_data['document_type_id']
tag_ids = serializer.validated_data['tag_ids']
title = serializer.validated_data['title']
t = int(mktime(datetime.now().timetuple()))
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
dir=settings.SCRATCH_DIR,
delete=False) as f:
f.write(document_data)
os.utime(f.name, times=(t, t))
async_task("documents.tasks.consume_file",
f.name,
override_filename=document.name,
override_title=title,
override_correspondent_id=correspondent_id,
override_document_type_id=document_type_id,
override_tag_ids=tag_ids,
task_name=os.path.basename(document.name)[:100])
return Response("OK")
class SearchView(APIView):
permission_classes = (IsAuthenticated,)
@@ -229,30 +294,34 @@ class SearchView(APIView):
}
def get(self, request, format=None):
if 'query' in request.query_params:
query = request.query_params['query']
try:
page = int(request.query_params.get('page', 1))
except (ValueError, TypeError):
page = 1
if page < 1:
page = 1
with index.query_page(self.ix, query, page) as result_page:
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'results': list(map(self.add_infos_to_hit, result_page))})
else:
if 'query' not in request.query_params:
return Response({
'count': 0,
'page': 0,
'page_count': 0,
'results': []})
query = request.query_params['query']
try:
page = int(request.query_params.get('page', 1))
except (ValueError, TypeError):
page = 1
if page < 1:
page = 1
try:
with index.query_page(self.ix, query, page) as (result_page,
corrected_query):
return Response(
{'count': len(result_page),
'page': result_page.pagenum,
'page_count': result_page.pagecount,
'corrected_query': corrected_query,
'results': list(map(self.add_infos_to_hit, result_page))})
except Exception as e:
return HttpResponseBadRequest(str(e))
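The reworked handler returns early for missing queries, clamps the page number, and now includes the spelling-corrected query in the payload. An illustrative response shape only — the values are made up:
{
    "count": 10,            # hits on this page
    "page": 1,
    "page_count": 4,
    "corrected_query": "invoice",
    "results": ["..."],     # hits decorated by add_infos_to_hit
}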
class SearchAutoCompleteView(APIView):

View File

@@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs):
binaries = (
settings.CONVERT_BINARY,
settings.OPTIPNG_BINARY,
settings.UNPAPER_BINARY,
"tesseract"
)

View File

@@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta
MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media"))
ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals")
ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive")
THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data"))
@@ -85,6 +86,7 @@ INSTALLED_APPS = [
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
"django_filters",
"django_q",
@@ -94,7 +96,8 @@ INSTALLED_APPS = [
REST_FRAMEWORK = {
'DEFAULT_AUTHENTICATION_CLASSES': [
'rest_framework.authentication.BasicAuthentication',
'rest_framework.authentication.SessionAuthentication'
'rest_framework.authentication.SessionAuthentication',
'rest_framework.authentication.TokenAuthentication'
]
}
@@ -255,26 +258,43 @@ DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER")
LOGGING = {
"version": 1,
"disable_existing_loggers": False,
'formatters': {
'verbose': {
'format': '{levelname} {asctime} {module} {message}',
'style': '{',
},
'simple': {
'format': '{levelname} {message}',
'style': '{',
},
},
"handlers": {
"dbhandler": {
"db": {
"level": "DEBUG",
"class": "documents.loggers.PaperlessHandler",
},
"streamhandler": {
"class": "logging.StreamHandler"
"console": {
"level": "INFO",
"class": "logging.StreamHandler",
"formatter": "verbose",
}
},
"root": {
"handlers": ["console"],
"level": "DEBUG",
},
"loggers": {
"documents": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
"handlers": ["db"],
"propagate": True,
},
"paperless_mail": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
"handlers": ["db"],
"propagate": True,
},
"paperless_tesseract": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
"handlers": ["db"],
"propagate": True,
},
},
}
@@ -331,6 +351,10 @@ CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
@@ -339,9 +363,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# OCRmyPDF --output-type options are available.
# TODO: validate this setting.
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
# One of: skip, redo, force
# TODO: validate this.
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")
@@ -350,11 +382,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
# Pre-2.x versions of Paperless stored your documents locally with GPG

View File

@@ -4,6 +4,7 @@ from django.contrib.auth.decorators import login_required
from django.urls import path, re_path
from django.views.decorators.csrf import csrf_exempt
from django.views.generic import RedirectView
from rest_framework.authtoken import views
from rest_framework.routers import DefaultRouter
from documents.views import (
@@ -15,7 +16,8 @@ from documents.views import (
SearchView,
IndexView,
SearchAutoCompleteView,
StatisticsView
StatisticsView,
PostDocumentView
)
from paperless.views import FaviconView
@@ -45,6 +47,11 @@ urlpatterns = [
StatisticsView.as_view(),
name="statistics"),
re_path(r"^documents/post_document/", PostDocumentView.as_view(),
name="post_document"),
path('token/', views.obtain_auth_token)
] + api_router.urls)),
re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),

View File

@@ -1 +1 @@
__version__ = (0, 9, 3)
__version__ = (0, 9, 5)

View File

@@ -4,6 +4,7 @@ from datetime import timedelta, date
import magic
from django.conf import settings
from django.db import DatabaseError
from django.utils.text import slugify
from django_q.tasks import async_task
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
@@ -86,46 +87,6 @@ def make_criterias(rule):
return {**criterias, **get_rule_action(rule).get_criteria()}
def get_title(message, att, rule):
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
title = message.subject
elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
title = os.path.splitext(os.path.basename(att.filename))[0]
else:
raise ValueError("Unknown title selector.")
return title
def get_correspondent(message, rule):
if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING:
correspondent = None
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL:
correspondent_name = message.from_
correspondent = Correspondent.objects.get_or_create(
name=correspondent_name, defaults={
"slug": slugify(correspondent_name)
})[0]
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME:
if message.from_values and \
'name' in message.from_values \
and message.from_values['name']:
correspondent_name = message.from_values['name']
else:
correspondent_name = message.from_
correspondent = Correspondent.objects.get_or_create(
name=correspondent_name, defaults={
"slug": slugify(correspondent_name)
})[0]
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
correspondent = rule.assign_correspondent
else:
raise ValueError("Unknwown correspondent selector")
return correspondent
def get_mailbox(server, port, security):
if security == MailAccount.IMAP_SECURITY_NONE:
mailbox = MailBoxUnencrypted(server, port)
@@ -140,6 +101,51 @@ def get_mailbox(server, port, security):
class MailAccountHandler(LoggingMixin):
def _correspondent_from_name(self, name):
try:
return Correspondent.objects.get_or_create(
name=name, defaults={
"slug": slugify(name)
})[0]
except DatabaseError as e:
self.log(
"error",
f"Error while retrieving correspondent {name}: {e}"
)
return None
def get_title(self, message, att, rule):
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
return message.subject
elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
return os.path.splitext(os.path.basename(att.filename))[0]
else:
raise ValueError("Unknown title selector.")
def get_correspondent(self, message, rule):
c_from = rule.assign_correspondent_from
if c_from == MailRule.CORRESPONDENT_FROM_NOTHING:
return None
elif c_from == MailRule.CORRESPONDENT_FROM_EMAIL:
return self._correspondent_from_name(message.from_)
elif c_from == MailRule.CORRESPONDENT_FROM_NAME:
if message.from_values and 'name' in message.from_values and message.from_values['name']: # NOQA: E501
return self._correspondent_from_name(
message.from_values['name'])
else:
return self._correspondent_from_name(message.from_)
elif c_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
return rule.assign_correspondent
else:
raise ValueError("Unknwown correspondent selector")
def handle_mail_account(self, account):
self.renew_logging_group()
@@ -156,79 +162,89 @@ class MailAccountHandler(LoggingMixin):
M.login(account.username, account.password)
except Exception:
raise MailError(
f"Error while authenticating account {account.name}")
f"Error while authenticating account {account}")
self.log('debug', f"Account {account}: Processing "
f"{account.rules.count()} rule(s)")
for rule in account.rules.order_by('order'):
self.log(
'debug',
f"Account {account}: Processing rule {rule.name}")
self.log(
'debug',
f"Rule {account}.{rule}: Selecting folder {rule.folder}")
try:
M.folder.set(rule.folder)
except MailboxFolderSelectError:
raise MailError(
f"Rule {rule.name}: Folder {rule.folder} "
f"does not exist in account {account.name}")
total_processed_files += self.handle_mail_rule(M, rule)
except Exception as e:
self.log(
"error",
f"Rule {rule}: Error while processing rule: {e}",
exc_info=True
)
criterias = make_criterias(rule)
return total_processed_files
def handle_mail_rule(self, M, rule):
self.log(
'debug',
f"Rule {rule}: Selecting folder {rule.folder}")
try:
M.folder.set(rule.folder)
except MailboxFolderSelectError:
raise MailError(
f"Rule {rule}: Folder {rule.folder} "
f"does not exist in account {rule.account}")
criterias = make_criterias(rule)
self.log(
'debug',
f"Rule {rule}: Searching folder with criteria "
f"{str(AND(**criterias))}")
try:
messages = M.fetch(criteria=AND(**criterias),
mark_seen=False)
except Exception:
raise MailError(
f"Rule {rule}: Error while fetching folder {rule.folder}")
post_consume_messages = []
mails_processed = 0
total_processed_files = 0
for message in messages:
try:
processed_files = self.handle_message(message, rule)
if processed_files > 0:
post_consume_messages.append(message.uid)
total_processed_files += processed_files
mails_processed += 1
except Exception as e:
self.log(
'debug',
f"Rule {account}.{rule}: Searching folder with criteria "
f"{str(AND(**criterias))}")
"error",
f"Rule {rule}: Error while processing mail "
f"{message.uid}: {e}",
exc_info=True)
try:
messages = M.fetch(criteria=AND(**criterias),
mark_seen=False)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while fetching folder "
f"{rule.folder} of account {account.name}")
self.log(
'debug',
f"Rule {rule}: Processed {mails_processed} matching mail(s)")
post_consume_messages = []
self.log(
'debug',
f"Rule {rule}: Running mail actions on "
f"{len(post_consume_messages)} mails")
mails_processed = 0
try:
get_rule_action(rule).post_consume(
M,
post_consume_messages,
rule.action_parameter)
for message in messages:
try:
processed_files = self.handle_message(message, rule)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing mail "
f"{message.uid} of account {account.name}")
if processed_files > 0:
post_consume_messages.append(message.uid)
total_processed_files += processed_files
mails_processed += 1
self.log(
'debug',
f"Rule {account}.{rule}: Processed {mails_processed} "
f"matching mail(s)")
self.log(
'debug',
f"Rule {account}.{rule}: Running mail actions on "
f"{len(post_consume_messages)} mails")
try:
get_rule_action(rule).post_consume(
M,
post_consume_messages,
rule.action_parameter)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing "
f"post-consume actions for account {account.name}")
except Exception as e:
raise MailError(
f"Rule {rule}: Error while processing post-consume actions: "
f"{e}")
return total_processed_files
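The refactor above splits account handling into per-rule and per-mail units, so one broken rule or message no longer aborts the whole account: errors are logged and processing continues. The pattern in miniature, with illustrative names:
def process_batch(items, handle, log):
    processed = 0
    for item in items:
        try:
            processed += handle(item)
        except Exception as e:
            # log and continue instead of aborting the whole batch
            log(f"error on {item}: {e}")
    return processed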
@@ -238,11 +254,11 @@ class MailAccountHandler(LoggingMixin):
self.log(
'debug',
f"Rule {rule.account}.{rule}: "
f"Rule {rule}: "
f"Processing mail {message.subject} from {message.from_} with "
f"{len(message.attachments)} attachment(s)")
correspondent = get_correspondent(message, rule)
correspondent = self.get_correspondent(message, rule)
tag = rule.assign_tag
doc_type = rule.assign_document_type
@@ -253,12 +269,12 @@ class MailAccountHandler(LoggingMixin):
if not att.content_disposition == "attachment":
self.log(
'debug',
f"Rule {rule.account}.{rule}: "
f"Rule {rule}: "
f"Skipping attachment {att.filename} "
f"with content disposition inline")
f"with content disposition {att.content_disposition}")
continue
title = get_title(message, att, rule)
title = self.get_title(message, att, rule)
# don't trust the content type of the attachment. Could be
# generic application/octet-stream.
@@ -274,7 +290,7 @@ class MailAccountHandler(LoggingMixin):
self.log(
'info',
f"Rule {rule.account}.{rule}: "
f"Rule {rule}: "
f"Consuming attachment {att.filename} from mail "
f"{message.subject} from {message.from_}")
@@ -293,7 +309,7 @@ class MailAccountHandler(LoggingMixin):
else:
self.log(
'debug',
f"Rule {rule.account}.{rule}: "
f"Rule {rule}: "
f"Skipping attachment {att.filename} "
f"since guessed mime type {mime_type} is not supported "
f"by paperless")

View File

@@ -139,4 +139,4 @@ class MailRule(models.Model):
)
def __str__(self):
return self.name
return f"{self.account.name}.{self.name}"

View File

@@ -1,14 +1,20 @@
import logging
from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailAccountHandler, MailError
from paperless_mail.models import MailAccount
def process_mail_accounts():
total_new_documents = 0
for account in MailAccount.objects.all():
total_new_documents += MailAccountHandler().handle_mail_account(
account)
try:
total_new_documents += MailAccountHandler().handle_mail_account(
account)
except MailError as e:
logging.getLogger(__name__).error(
f"Error while processing mail account {account}: {e}",
exc_info=True
)
if total_new_documents > 0:
return f"Added {total_new_documents} document(s)."
@@ -17,8 +23,8 @@ def process_mail_accounts():
def process_mail_account(name):
account = MailAccount.objects.find(name=name)
if account:
try:
account = MailAccount.objects.get(name=name)
MailAccountHandler().handle_mail_account(account)
else:
logging.error("Unknown mail acccount: {}".format(name))
except MailAccount.DoesNotExist:
logging.getLogger(__name__).error(f"Unknown mail acccount: {name}")

View File

@@ -3,11 +3,14 @@ from collections import namedtuple
from typing import ContextManager
from unittest import mock
from django.core.management import call_command
from django.db import DatabaseError
from django.test import TestCase
from imap_tools import MailMessageFlags, MailboxFolderSelectError
from documents.models import Correspondent
from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title
from paperless_mail import tasks
from paperless_mail.mail import MailError, MailAccountHandler
from paperless_mail.models import MailRule, MailAccount
@@ -163,28 +166,30 @@ class TestMail(TestCase):
me_localhost = Correspondent.objects.create(name=message2.from_)
someone_else = Correspondent.objects.create(name="someone else")
handler = MailAccountHandler()
rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
self.assertIsNone(get_correspondent(message, rule))
self.assertIsNone(handler.get_correspondent(message, rule))
rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
c = get_correspondent(message, rule)
c = handler.get_correspondent(message, rule)
self.assertIsNotNone(c)
self.assertEqual(c.name, "someone@somewhere.com")
c = get_correspondent(message2, rule)
c = handler.get_correspondent(message2, rule)
self.assertIsNotNone(c)
self.assertEqual(c.name, "me@localhost.com")
self.assertEqual(c.id, me_localhost.id)
rule = MailRule(name="c", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
c = get_correspondent(message, rule)
c = handler.get_correspondent(message, rule)
self.assertIsNotNone(c)
self.assertEqual(c.name, "Someone!")
c = get_correspondent(message2, rule)
c = handler.get_correspondent(message2, rule)
self.assertIsNotNone(c)
self.assertEqual(c.id, me_localhost.id)
rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
c = get_correspondent(message, rule)
c = handler.get_correspondent(message, rule)
self.assertEqual(c, someone_else)
def test_get_title(self):
@@ -192,10 +197,13 @@ class TestMail(TestCase):
message.subject = "the message title"
att = namedtuple('Attachment', [])
att.filename = "this_is_the_file.pdf"
handler = MailAccountHandler()
rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME)
self.assertEqual(get_title(message, att, rule), "this_is_the_file")
self.assertEqual(handler.get_title(message, att, rule), "this_is_the_file")
rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT)
self.assertEqual(get_title(message, att, rule), "the message title")
self.assertEqual(handler.get_title(message, att, rule), "the message title")
def test_handle_message(self):
message = create_message(subject="the message title", from_="Myself", num_attachments=2)
@@ -317,7 +325,7 @@ class TestMail(TestCase):
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
def test_errors(self):
def test_error_login(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")
try:
@@ -327,26 +335,84 @@ class TestMail(TestCase):
else:
self.fail("Should raise exception")
def test_error_skip_account(self):
account_faulty = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wroasdng")
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE,
action_parameter="spam", filter_subject="Claim")
tasks.process_mail_accounts()
self.assertEqual(self.async_task.call_count, 1)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
def test_error_skip_rule(self):
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE,
action_parameter="spam", filter_subject="Claim", order=1, folder="uuuhhhh")
rule2 = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE,
action_parameter="spam", filter_subject="Claim", order=2)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 1)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
@mock.patch("paperless_mail.mail.MailAccountHandler.get_correspondent")
def test_error_skip_mail(self, m):
def get_correspondent_fake(message, rule):
if message.from_ == 'amazon@amazon.de':
raise ValueError("Does not compute.")
else:
return None
m.side_effect = get_correspondent_fake
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam")
self.mail_account_handler.handle_mail_account(account)
# Test that we still consume mail even if some messages raise errors.
self.assertEqual(self.async_task.call_count, 2)
# faulty mail still in inbox, untouched
self.assertEqual(len(self.bogus_mailbox.messages), 1)
self.assertEqual(self.bogus_mailbox.messages[0].from_, 'amazon@amazon.de')
def test_error_create_correspondent(self):
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
rule = MailRule.objects.create(
name="testrule", filter_from="amazon@amazon.de",
account=account, action=MailRule.ACTION_MOVE, action_parameter="spam",
assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
self.mail_account_handler.handle_mail_account(account)
self.async_task.assert_called_once()
args, kwargs = self.async_task.call_args
c = Correspondent.objects.get(name="amazon@amazon.de")
# should work
self.assertEqual(kwargs['override_correspondent_id'], c.id)
self.async_task.reset_mock()
self.reset_bogus_mailbox()
with mock.patch("paperless_mail.mail.Correspondent.objects.get_or_create") as m:
m.side_effect = DatabaseError()
try:
self.mail_account_handler.handle_mail_account(account)
except MailError as e:
self.assertTrue("uuuh does not exist" in str(e))
else:
self.fail("Should raise exception")
account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
args, kwargs = self.async_task.call_args
self.async_task.assert_called_once()
self.assertEqual(kwargs['override_correspondent_id'], None)
rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")
try:
self.mail_account_handler.handle_mail_account(account)
except MailError as e:
self.assertTrue("Error while processing post-consume actions" in str(e))
else:
self.fail("Should raise exception")
def test_filters(self):
@@ -390,3 +456,43 @@ class TestMail(TestCase):
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 5)
class TestManagementCommand(TestCase):
@mock.patch("paperless_mail.management.commands.mail_fetcher.tasks.process_mail_accounts")
def test_mail_fetcher(self, m):
call_command("mail_fetcher")
m.assert_called_once()
class TestTasks(TestCase):
@mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account")
def test_all_accounts(self, m):
m.side_effect = lambda account: 6
MailAccount.objects.create(name="A", imap_server="A", username="A", password="A")
MailAccount.objects.create(name="B", imap_server="A", username="A", password="A")
result = tasks.process_mail_accounts()
self.assertEqual(m.call_count, 2)
self.assertIn("Added 12", result)
m.side_effect = lambda account: 0
result = tasks.process_mail_accounts()
self.assertIn("No new", result)
@mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account")
def test_single_accounts(self, m):
MailAccount.objects.create(name="A", imap_server="A", username="A", password="A")
tasks.process_mail_account("A")
m.assert_called_once()
m.reset_mock()
tasks.process_mail_account("B")
m.assert_not_called()

View File

@@ -14,12 +14,21 @@ def get_tesseract_langs():
@register()
def check_default_language_available(app_configs, **kwargs):
langs = get_tesseract_langs()
installed_langs = get_tesseract_langs()
if settings.OCR_LANGUAGE not in langs:
return [Error(
f"The default ocr language {settings.OCR_LANGUAGE} is "
f"not installed. Paperless cannot OCR your documents "
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
else:
return []
if not settings.OCR_LANGUAGE:
return [Warning(
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
"This means that tesseract will fallback to english."
)]
specified_langs = settings.OCR_LANGUAGE.split("+")
for lang in specified_langs:
if lang not in installed_langs:
return [Error(
f"The selected ocr language {lang} is "
f"not installed. Paperless cannot OCR your documents "
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
return []
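# Illustrative note (not part of the diff): with a combined setting such as
# PAPERLESS_OCR_LANGUAGE="deu+eng", specified_langs becomes ["deu", "eng"],
# and every entry must appear in the list returned by get_tesseract_langs()
# for the check to pass without errors.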

View File

@@ -1,23 +1,15 @@
import itertools
import json
import os
import re
import subprocess
from multiprocessing.pool import ThreadPool
import langdetect
import ocrmypdf
import pdftotext
import pyocr
from PIL import Image
from django.conf import settings
from pyocr import PyocrException
from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, run_unpaper, \
run_convert
from .languages import ISO639
class OCRError(Exception):
pass
from documents.parsers import DocumentParser, ParseError, run_convert
class RasterisedDocumentParser(DocumentParser):
@@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
def __init__(self, path, logging_group):
super().__init__(path, logging_group)
self._text = None
def get_thumbnail(self):
def get_thumbnail(self, document_path, mime_type):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
@@ -43,8 +31,8 @@ class RasterisedDocumentParser(DocumentParser):
scale="500x5000>",
alpha="remove",
strip=True,
trim=True,
input_file="{}[0]".format(self.document_path),
trim=False,
input_file="{}[0]".format(document_path),
output_file=out_path,
logging_group=self.logging_group)
except ParseError:
@@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser):
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
self.document_path]
document_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
@@ -67,176 +55,160 @@ class RasterisedDocumentParser(DocumentParser):
scale="500x5000>",
alpha="remove",
strip=True,
trim=True,
trim=False,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group)
return out_path
def _is_ocred(self):
# Extract text from PDF using pdftotext
text = get_text_from_pdf(self.document_path)
# We assume that a PDF with at least 50 characters contains text
# (so no OCR required)
return len(text) > 50
def get_text(self):
if self._text is not None:
return self._text
if not settings.OCR_ALWAYS and self._is_ocred():
self.log("debug", "Skipping OCR, using Text from PDF")
self._text = get_text_from_pdf(self.document_path)
return self._text
images = self._get_greyscale()
if not images:
raise ParseError("Empty document, nothing to do.")
def is_image(self, mime_type):
return mime_type in [
"image/png",
"image/jpeg",
"image/tiff",
"image/bmp",
"image/gif",
]
def get_dpi(self, image):
try:
sample_page_index = int(len(images) / 2)
self.log(
"debug",
f"Attempting language detection on page "
f"{sample_page_index + 1} of {len(images)}...")
sample_page_text = self._ocr([images[sample_page_index]],
settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed.")
ocr_pages = self._complete_ocr_default_language(
images, sample_page_index, sample_page_text)
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
self.log(
"debug",
f"Detected language: {guessed_language} "
f"(default language)")
ocr_pages = self._complete_ocr_default_language(
images, sample_page_index, sample_page_text)
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): # NOQA: E501
self.log(
"warning",
f"Detected language {guessed_language} is not available "
f"on this system.")
ocr_pages = self._complete_ocr_default_language(
images, sample_page_index, sample_page_text)
else:
self.log("debug", f"Detected language: {guessed_language}")
ocr_pages = self._ocr(images, ISO639[guessed_language])
self.log("debug", "OCR completed.")
self._text = strip_excess_whitespace(" ".join(ocr_pages))
return self._text
except OCRError as e:
raise ParseError(e)
def _get_greyscale(self):
"""
Greyscale images are easier for Tesseract to OCR
"""
# Convert PDF to multiple PNMs
input_file = self.document_path
if settings.OCR_PAGES == 1:
input_file += "[0]"
elif settings.OCR_PAGES > 1:
input_file += f"[0-{settings.OCR_PAGES - 1}]"
self.log(
"debug",
f"Converting document {input_file} into greyscale images")
output_files = os.path.join(self.tempdir, "convert-%04d.pnm")
run_convert(density=settings.CONVERT_DENSITY,
depth="8",
type="grayscale",
input_file=input_file,
output_file=output_files,
logging_group=self.logging_group)
# Get a list of converted images
pnms = []
for f in os.listdir(self.tempdir):
if f.endswith(".pnm"):
pnms.append(os.path.join(self.tempdir, f))
self.log("debug", f"Running unpaper on {len(pnms)} pages...")
# Run unpaper in parallel on converted images
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
pnms = pool.map(run_unpaper, pnms)
return sorted(filter(lambda __: os.path.isfile(__), pnms))
def _guess_language(self, text):
try:
guess = langdetect.detect(text)
return guess
with Image.open(image) as im:
x, y = im.info['dpi']
return x
except Exception as e:
self.log('warning', f"Language detection failed with: {e}")
self.log(
'warning',
f"Error while getting DPI from image {image}: {e}")
return None
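# Illustrative usage (values taken from the tests later in this diff):
#
#   parser.get_dpi("samples/simple.png")         # -> 72
#   parser.get_dpi("samples/simple-no-dpi.png")  # -> None (no DPI metadata)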
def _ocr(self, imgs, lang):
self.log(
"debug",
f"Performing OCR on {len(imgs)} page(s) with language {lang}")
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
return r
def parse(self, document_path, mime_type):
mode = settings.OCR_MODE
def _complete_ocr_default_language(self,
images,
sample_page_index,
sample_page):
images_copy = list(images)
del images_copy[sample_page_index]
if images_copy:
self.log('debug', "Continuing ocr with default language.")
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
ocr_pages.insert(sample_page_index, sample_page)
return ocr_pages
text_original = get_text_from_pdf(document_path)
has_text = text_original and len(text_original) > 50
if mode == "skip_noarchive" and has_text:
self.log("debug",
"Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
if mode in ['skip', 'skip_noarchive'] and not has_text:
# upgrade to redo, since there appears to be no text in the
# document. This happens to some weird encrypted documents or
# documents with failed OCR attempts for which OCRmyPDF will
# still report that there actually is text in them.
self.log("debug",
"No text was found in the document and skip is "
"specified. Upgrading OCR mode to redo.")
mode = "redo"
archive_path = os.path.join(self.tempdir, "archive.pdf")
ocr_args = {
'input_file': document_path,
'output_file': archive_path,
'use_threads': True,
'jobs': settings.THREADS_PER_WORKER,
'language': settings.OCR_LANGUAGE,
'output_type': settings.OCR_OUTPUT_TYPE,
'progress_bar': False,
'clean': True
}
if settings.OCR_PAGES > 0:
ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
# Mode selection.
if mode in ['skip', 'skip_noarchive']:
ocr_args['skip_text'] = True
elif mode == 'redo':
ocr_args['redo_ocr'] = True
elif mode == 'force':
ocr_args['force_ocr'] = True
else:
return [sample_page]
raise ParseError(
f"Invalid ocr mode: {mode}")
if self.is_image(mime_type):
dpi = self.get_dpi(document_path)
if dpi:
self.log(
"debug",
f"Detected DPI for image {document_path}: {dpi}"
)
ocr_args['image_dpi'] = dpi
elif settings.OCR_IMAGE_DPI:
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
else:
raise ParseError(
f"Cannot produce archive PDF for image {document_path}, "
f"no DPI information is present in this image and "
f"OCR_IMAGE_DPI is not set.")
if settings.OCR_USER_ARGS:
try:
user_args = json.loads(settings.OCR_USER_ARGS)
ocr_args = {**ocr_args, **user_args}
except Exception as e:
self.log(
"warning",
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
f"they will not be used: {e}")
# This forces tesseract to use one core per page.
os.environ['OMP_THREAD_LIMIT'] = "1"
try:
self.log("debug",
f"Calling OCRmyPDF with {str(ocr_args)}")
ocrmypdf.ocr(**ocr_args)
# success! announce results
self.archive_path = archive_path
self.text = get_text_from_pdf(archive_path)
except (InputFileError, EncryptedPdfError) as e:
self.log("debug",
f"Encountered an error: {e}. Trying to use text from "
f"original.")
# This happens with some PDFs when used with the redo_ocr option.
# This is not the end of the world, we'll just use what we already
# have in the document.
self.text = text_original
# Also, no archived file.
if not self.text:
# However, if we don't have anything, fail:
raise ParseError(e)
except Exception as e:
# Anything else is probably serious.
raise ParseError(e)
if not self.text:
# This may happen for files that don't have any text.
self.log(
'warning',
f"Document {document_path} does not have any text."
f"This is probably an error or you tried to add an image "
f"without text, or something is wrong with this document.")
self.text = ""
def strip_excess_whitespace(text):
if not text:
return None
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(
r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub(
r"([^\S\n\r]+)$", '', no_leading_whitespace)
return no_trailing_whitespace
def image_to_string(args):
img, lang = args
ocr = pyocr.get_available_tools()[0]
with Image.open(img) as f:
if ocr.can_detect_orientation():
try:
orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1)
except Exception:
# Rotation not possible, ignore
pass
try:
return ocr.image_to_string(f, lang=lang)
except PyocrException as e:
raise OCRError(e)
# TODO: this needs a rework
return no_trailing_whitespace.strip()
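# Illustrative behaviour (mirrors the text_cases used by the tests in this
# diff): runs of spaces and tabs collapse to a single space, whitespace after
# a line break is dropped, and trailing whitespace is stripped, e.g.
#
#   strip_excess_whitespace("simple newline\n   testing string")
#   # -> "simple newline\ntesting string"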
def get_text_from_pdf(pdf_file):
@@ -245,6 +217,9 @@ def get_text_from_pdf(pdf_file):
try:
pdf = pdftotext.PDF(f)
except pdftotext.Error:
return ""
# might not be a PDF file
return None
return "\n".join(pdf)
text = "\n".join(pdf)
return strip_excess_whitespace(text)

View File

@@ -5,9 +5,12 @@ def tesseract_consumer_declaration(sender, **kwargs):
return {
"parser": RasterisedDocumentParser,
"weight": 0,
"mime_types": [
"application/pdf",
"image/jpeg",
"image/png"
]
"mime_types": {
"application/pdf": ".pdf",
"image/jpeg": ".jpg",
"image/png": ".png",
"image/tiff": ".tif",
"image/gif": ".gif",
"image/bmp": ".bmp",
}
}
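# Illustrative sketch (an assumption, not the actual implementation): mapping
# mime types to extensions lets the consumer aggregate supported extensions
# across parsers, roughly like so:
#
#   def collect_extensions(declarations):
#       return {ext for decl in declarations
#               for ext in decl["mime_types"].values()}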

Binary sample files (test images) added or changed; previews not shown.

View File

@@ -1,193 +0,0 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4
from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings
from ..parsers import RasterisedDocumentParser
class TestDate(TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
def setUp(self):
os.makedirs(self.SCRATCH, exist_ok=True)
def tearDown(self):
shutil.rmtree(self.SCRATCH)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_1(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = "lorem ipsum 130218 lorem ipsum"
self.assertEqual(document.get_date(), None)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_2(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = "lorem ipsum 2018 lorem ipsum"
self.assertEqual(document.get_date(), None)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_3(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = "lorem ipsum 20180213 lorem ipsum"
self.assertEqual(document.get_date(), None)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_4(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = "lorem ipsum 13.02.2018 lorem ipsum"
date = document.get_date()
self.assertEqual(
date,
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_5(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = (
"lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
"ipsum"
)
date = document.get_date()
self.assertEqual(
date,
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_6(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum"
)
self.assertEqual(document.get_date(), None)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_7(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = (
"lorem ipsum\n"
"März 2019\n"
"lorem ipsum"
)
date = document.get_date()
self.assertEqual(
date,
datetime.datetime(
2019, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_8(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum\n"
"März 2020"
)
self.assertEqual(
document.get_date(),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_date_format_9(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file, None)
document._text = (
"lorem ipsum\n"
"27. Nullmonth 2020\n"
"März 2020\n"
"lorem ipsum"
)
self.assertEqual(
document.get_date(),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-0590 00:00:00"
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null", None)
document.get_text()
self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-2350 00:00:00"
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_crazy_date_future(self, *args):
document = RasterisedDocumentParser("/dev/null", None)
document.get_text()
self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="20 408000l 2475"
)
@override_settings(SCRATCH_DIR=SCRATCH)
def test_crazy_date_with_spaces(self, *args):
document = RasterisedDocumentParser("/dev/null", None)
document.get_text()
self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="No date in here"
)
@override_settings(FILENAME_DATE_ORDER="YMD")
@override_settings(SCRATCH_DIR=SCRATCH)
def test_filename_date_parse_invalid(self, *args):
document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None)
document.get_text()
self.assertIsNone(document.get_date())

View File

@@ -1,76 +0,0 @@
import os
from unittest import mock, skipIf
import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from ..parsers import image_to_string, strip_excess_whitespace
class FakeTesseract(object):
@staticmethod
def can_detect_orientation():
return True
@staticmethod
def detect_orientation(file_handle, lang):
raise OtherTesseractError("arbitrary status", "message")
@staticmethod
def image_to_string(file_handle, lang):
return "This is test text"
class FakePyOcr(object):
@staticmethod
def get_available_tools():
return [FakeTesseract]
class TestOCR(TestCase):
text_cases = [
("simple string", "simple string"),
(
"simple newline\n testing string",
"simple newline\ntesting string"
),
(
"utf-8 строка с пробелами в конце ",
"utf-8 строка с пробелами в конце"
)
]
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
def test_strip_excess_whitespace(self):
for source, result in self.text_cases:
actual_result = strip_excess_whitespace(source)
self.assertEqual(
result,
actual_result,
"strip_exceess_whitespace({}) != '{}', but '{}'".format(
source,
result,
actual_result
)
)
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
def test_image_to_string_with_text_free_page(self):
"""
This test is sort of silly, since it's really just reproducing an odd
exception thrown by pyocr when it encounters a page with no text.
Actually running this test against an installation of Tesseract results
in a segmentation fault rooted somewhere deep inside pyocr where I
don't care to dig. Regardless, if you run the consumer normally,
text-free pages are now handled correctly so long as we work around
this weird exception.
"""
image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"])

View File

@@ -1,46 +1,17 @@
import os
import shutil
import tempfile
import uuid
from typing import ContextManager
from unittest import mock
from django.test import TestCase, override_settings
from pyocr.error import TesseractError
from documents.parsers import ParseError, run_convert
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError
from documents.tests.utils import DirectoriesMixin
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace
image_to_string_calls = []
class FakeTesseract(object):
@staticmethod
def can_detect_orientation():
return True
@staticmethod
def detect_orientation(file_handle, lang):
raise TesseractError("arbitrary status", "message")
@staticmethod
def get_available_languages():
return ['eng', 'deu']
@staticmethod
def image_to_string(file_handle, lang):
image_to_string_calls.append((file_handle.name, lang))
return file_handle.read()
class FakePyOcr(object):
@staticmethod
def get_available_tools():
return [FakeTesseract]
def fake_convert(input_file, output_file, **kwargs):
with open(input_file) as f:
lines = f.readlines()
@@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs):
f2.write(line.strip())
def fake_unpaper(pnm):
output = pnm + ".unpaper.pnm"
shutil.copy(pnm, output)
return output
class FakeImageFile(ContextManager):
def __init__(self, fname):
self.fname = fname
@@ -67,142 +32,50 @@ class FakeImageFile(ContextManager):
return os.path.basename(self.fname)
fake_image = FakeImageFile
@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert)
@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper)
@mock.patch("paperless_tesseract.parsers.Image.open", open)
class TestRasterisedDocumentParser(TestCase):
class TestParser(DirectoriesMixin, TestCase):
def setUp(self):
self.scratch = tempfile.mkdtemp()
def assertContainsStrings(self, content, strings):
# Asserts that all strings appear in content, in the given order.
indices = [content.index(s) for s in strings]
self.assertListEqual(indices, sorted(indices))
global image_to_string_calls
text_cases = [
("simple string", "simple string"),
(
"simple newline\n testing string",
"simple newline\ntesting string"
),
(
"utf-8 строка с пробелами в конце ",
"utf-8 строка с пробелами в конце"
)
]
image_to_string_calls = []
override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable()
def tearDown(self):
shutil.rmtree(self.scratch)
def get_input_file(self, pages):
_, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch)
with open(fname, "w") as f:
f.writelines([f"line {p}\n" for p in range(pages)])
return fname
@mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
def test_parse_text_simple_language_match(self):
parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4())
text = parser.get_text()
self.assertEqual(text, "line 0")
self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"])
@mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
def test_parse_text_2_pages(self):
parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4())
text = parser.get_text()
self.assertEqual(text, "line 0 line 1")
self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"])
@mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
def test_parse_text_3_pages(self):
parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
text = parser.get_text()
self.assertEqual(text, "line 0 line 1 line 2")
self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])
@mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None)
def test_parse_text_lang_detect_failed(self):
parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
text = parser.get_text()
self.assertEqual(text, "line 0 line 1 line 2")
self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])
@mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it")
def test_parse_text_lang_not_installed(self):
parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4())
text = parser.get_text()
self.assertEqual(text, "line 0 line 1 line 2 line 3")
self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"])
@mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
def test_parse_text_lang_mismatch(self):
parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
text = parser.get_text()
self.assertEqual(text, "line 0 line 1 line 2")
self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"])
@mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
def test_parse_empty_doc(self):
parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())
try:
parser.get_text()
except ParseError as e:
self.assertEqual("Empty document, nothing to do.", str(e))
else:
self.fail("Should raise exception")
class TestAuxilliaryFunctions(TestCase):
def setUp(self):
self.scratch = tempfile.mkdtemp()
override_settings(SCRATCH_DIR=self.scratch).enable()
def tearDown(self):
shutil.rmtree(self.scratch)
def test_strip_excess_whitespace(self):
for source, result in self.text_cases:
actual_result = strip_excess_whitespace(source)
self.assertEqual(
result,
actual_result,
"strip_exceess_whitespace({}) != '{}', but '{}'".format(
source,
result,
actual_result
)
)
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
def test_get_text_from_pdf(self):
text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf'))
text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'))
self.assertEqual(text.strip(), "This is a test document.")
def test_get_text_from_pdf_error(self):
text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png'))
self.assertEqual(text.strip(), "")
def test_image_to_string(self):
text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng"))
self.assertEqual(text, "This is a test document.")
def test_image_to_string_language_unavailable(self):
try:
image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita"))
except OCRError as e:
self.assertTrue("Failed loading language" in str(e))
else:
self.fail("Should raise exception")
@override_settings(OCR_ALWAYS=False)
@mock.patch("paperless_tesseract.parsers.get_text_from_pdf")
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale")
def test_is_ocred(self, m2, m):
parser = RasterisedDocumentParser("", uuid.uuid4())
m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \
"lots of text lots of text lots of text lots of text lots of text lots of text " \
"lots of text lots of text lots of text lots of text lots of text lots of text "
parser.get_text()
self.assertEqual(m.call_count, 2)
self.assertEqual(m2.call_count, 0)
self.assertContainsStrings(text.strip(), ["This is a test document."])
def test_thumbnail(self):
parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
parser.get_thumbnail()
parser = RasterisedDocumentParser(uuid.uuid4())
parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
# Don't really know how to test this; just call it and assert that it does not raise.
@mock.patch("paperless_tesseract.parsers.run_convert")
@@ -216,6 +89,191 @@ class TestAuxilliaryFunctions(TestCase):
m.side_effect = call_convert
parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
parser.get_thumbnail()
parser = RasterisedDocumentParser(uuid.uuid4())
parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
# Don't really know how to test this; just call it and assert that it does not raise.
def test_get_dpi(self):
parser = RasterisedDocumentParser(None)
dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
self.assertEqual(dpi, None)
dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png"))
self.assertEqual(dpi, 72)
def test_simple_digital(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
def test_with_form(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])
@override_settings(OCR_MODE="redo")
def test_with_form_error(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])
@override_settings(OCR_MODE="redo")
@mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None)
def test_with_form_error_notext(self):
parser = RasterisedDocumentParser(None)
def f():
parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")
self.assertRaises(ParseError, f)
@override_settings(OCR_MODE="force")
def test_with_form_force(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")
self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])
def test_image_simple(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text(), ["This is a test document."])
def test_image_simple_alpha_fail(self):
parser = RasterisedDocumentParser(None)
def f():
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png")
self.assertRaises(ParseError, f)
def test_image_no_dpi_fail(self):
parser = RasterisedDocumentParser(None)
def f():
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
self.assertRaises(ParseError, f)
@override_settings(OCR_IMAGE_DPI=72)
def test_image_no_dpi_default(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."])
def test_multi_page(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
@override_settings(OCR_PAGES=2, OCR_MODE="skip")
def test_multi_page_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
def test_multi_page_pages_redo(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
@override_settings(OCR_PAGES=2, OCR_MODE="force")
def test_multi_page_pages_force(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
@override_settings(OCR_MODE="skip")
def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
def test_multi_page_analog_pages_redo(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
self.assertFalse("page 3" in parser.get_text().lower())
@override_settings(OCR_PAGES=1, OCR_MODE="force")
def test_multi_page_analog_pages_force(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
self.assertFalse("page 2" in parser.get_text().lower())
self.assertFalse("page 3" in parser.get_text().lower())
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_withtext(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_notext(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
class TestParserFileTypes(DirectoriesMixin, TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
def test_bmp(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertTrue("this is a test document" in parser.get_text().lower())
def test_jpg(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertTrue("this is a test document" in parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=200)
def test_gif(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertTrue("this is a test document" in parser.get_text().lower())
def test_tiff(self):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertTrue("this is a test document" in parser.get_text().lower())

View File

@@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser):
This parser directly parses a text document (.txt, .md, or .csv)
"""
def __init__(self, path, logging_group):
super().__init__(path, logging_group)
self._text = None
def get_thumbnail(self):
def get_thumbnail(self, document_path, mime_type):
"""
The thumbnail of a text file is just a 500px wide image of the text
rendered onto a letter-sized page.
@@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser):
)
def read_text():
with open(self.document_path, 'r') as src:
with open(document_path, 'r') as src:
lines = [line.strip() for line in src.readlines()]
text = "\n".join([line for line in lines[:n_lines]])
return text.replace('"', "'")
@@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser):
return out_path
def get_text(self):
if self._text is not None:
return self._text
with open(self.document_path, 'r') as f:
self._text = f.read()
return self._text
def parse(self, document_path, mime_type):
with open(document_path, 'r') as f:
self.text = f.read()
def run_command(*args):

View File

@@ -5,8 +5,8 @@ def text_consumer_declaration(sender, **kwargs):
return {
"parser": TextDocumentParser,
"weight": 10,
"mime_types": [
"text/plain",
"text/comma-separated-values"
]
"mime_types": {
"text/plain": ".txt",
"text/csv": ".csv",
}
}