2021-05-16 01:22:51 +02:00

627 lines
21 KiB
Python

import logging
import os
import tempfile
import uuid
import zipfile
from datetime import datetime
from time import mktime
from django.conf import settings
from django.db.models import Count, Max, Case, When, IntegerField
from django.db.models.functions import Lower
from django.http import HttpResponse, HttpResponseBadRequest, Http404
from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from rest_framework import parsers
from rest_framework.decorators import action
from rest_framework.exceptions import NotFound
from rest_framework.filters import OrderingFilter, SearchFilter
from rest_framework.generics import GenericAPIView
from rest_framework.mixins import (
DestroyModelMixin,
ListModelMixin,
RetrieveModelMixin,
UpdateModelMixin
)
from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.viewsets import (
GenericViewSet,
ModelViewSet,
ViewSet
)
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .bulk_download import OriginalAndArchiveStrategy, OriginalsOnlyStrategy, \
ArchiveOnlyStrategy
from .classifier import load_classifier
from .filters import (
CorrespondentFilterSet,
DocumentFilterSet,
TagFilterSet,
DocumentTypeFilterSet
)
from .matching import match_correspondents, match_tags, match_document_types
from .models import Correspondent, Document, Tag, DocumentType, SavedView
from .parsers import get_parser_class_for_mime_type
from .serialisers import (
CorrespondentSerializer,
DocumentSerializer,
TagSerializerVersion1,
TagSerializer,
DocumentTypeSerializer,
PostDocumentSerializer,
SavedViewSerializer,
BulkEditSerializer,
DocumentListSerializer,
BulkDownloadSerializer
)
logger = logging.getLogger("paperless.api")
class IndexView(TemplateView):
template_name = "index.html"
def get_language(self):
# This is here for the following reason:
# Django identifies languages in the form "en-us"
# However, angular generates locales as "en-US".
# this translates between these two forms.
lang = get_language()
if "-" in lang:
first = lang[:lang.index("-")]
second = lang[lang.index("-")+1:]
return f"{first}-{second.upper()}"
else:
return lang
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
context['cookie_prefix'] = settings.COOKIE_PREFIX
context['username'] = self.request.user.username
context['full_name'] = self.request.user.get_full_name()
context['styles_css'] = f"frontend/{self.get_language()}/styles.css"
context['runtime_js'] = f"frontend/{self.get_language()}/runtime.js"
context['polyfills_js'] = f"frontend/{self.get_language()}/polyfills.js" # NOQA: E501
context['main_js'] = f"frontend/{self.get_language()}/main.js"
context['webmanifest'] = f"frontend/{self.get_language()}/manifest.webmanifest" # NOQA: E501
context['apple_touch_icon'] = f"frontend/{self.get_language()}/apple-touch-icon.png" # NOQA: E501
return context
class CorrespondentViewSet(ModelViewSet):
model = Correspondent
queryset = Correspondent.objects.annotate(
document_count=Count('documents'),
last_correspondence=Max('documents__created')).order_by(Lower('name'))
serializer_class = CorrespondentSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filterset_class = CorrespondentFilterSet
ordering_fields = (
"name",
"matching_algorithm",
"match",
"document_count",
"last_correspondence")
class TagViewSet(ModelViewSet):
model = Tag
queryset = Tag.objects.annotate(
document_count=Count('documents')).order_by(Lower('name'))
def get_serializer_class(self):
if int(self.request.version) == 1:
return TagSerializerVersion1
else:
return TagSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filterset_class = TagFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
class DocumentTypeViewSet(ModelViewSet):
model = DocumentType
queryset = DocumentType.objects.annotate(
document_count=Count('documents')).order_by(Lower('name'))
serializer_class = DocumentTypeSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, OrderingFilter)
filterset_class = DocumentTypeFilterSet
ordering_fields = ("name", "matching_algorithm", "match", "document_count")
class DocumentViewSet(RetrieveModelMixin,
UpdateModelMixin,
DestroyModelMixin,
ListModelMixin,
GenericViewSet):
model = Document
queryset = Document.objects.all()
serializer_class = DocumentSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
filterset_class = DocumentFilterSet
search_fields = ("title", "correspondent__name", "content")
ordering_fields = (
"id",
"title",
"correspondent__name",
"document_type__name",
"created",
"modified",
"added",
"archive_serial_number")
def get_queryset(self):
return Document.objects.distinct()
def get_serializer(self, *args, **kwargs):
fields_param = self.request.query_params.get('fields', None)
if fields_param:
fields = fields_param.split(",")
else:
fields = None
serializer_class = self.get_serializer_class()
kwargs.setdefault('context', self.get_serializer_context())
kwargs.setdefault('fields', fields)
return serializer_class(*args, **kwargs)
def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update(
request, *args, **kwargs)
from documents import index
index.add_or_update_document(self.get_object())
return response
def destroy(self, request, *args, **kwargs):
from documents import index
index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
@staticmethod
def original_requested(request):
return (
'original' in request.query_params and
request.query_params['original'] == 'true'
)
def file_response(self, pk, request, disposition):
doc = Document.objects.get(id=pk)
if not self.original_requested(request) and doc.has_archive_version: # NOQA: E501
file_handle = doc.archive_file
filename = doc.get_public_filename(archive=True)
mime_type = 'application/pdf'
else:
file_handle = doc.source_file
filename = doc.get_public_filename()
mime_type = doc.mime_type
if doc.storage_type == Document.STORAGE_TYPE_GPG:
file_handle = GnuPG.decrypted(file_handle)
response = HttpResponse(file_handle, content_type=mime_type)
response["Content-Disposition"] = '{}; filename="{}"'.format(
disposition, filename)
return response
def get_metadata(self, file, mime_type):
if not os.path.isfile(file):
return None
parser_class = get_parser_class_for_mime_type(mime_type)
if parser_class:
parser = parser_class(progress_callback=None, logging_group=None)
try:
return parser.extract_metadata(file, mime_type)
except Exception as e:
# TODO: cover GPG errors, remove later.
return []
else:
return []
def get_filesize(self, filename):
if os.path.isfile(filename):
return os.stat(filename).st_size
else:
return None
@action(methods=['get'], detail=True)
def metadata(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
meta = {
"original_checksum": doc.checksum,
"original_size": self.get_filesize(doc.source_path),
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
"has_archive_version": doc.has_archive_version,
"original_metadata": self.get_metadata(
doc.source_path, doc.mime_type),
"archive_checksum": doc.archive_checksum,
"archive_media_filename": doc.archive_filename
}
if doc.has_archive_version:
meta['archive_size'] = self.get_filesize(doc.archive_path)
meta['archive_metadata'] = self.get_metadata(
doc.archive_path, "application/pdf")
else:
meta['archive_size'] = None
meta['archive_metadata'] = None
return Response(meta)
@action(methods=['get'], detail=True)
def suggestions(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
classifier = load_classifier()
return Response({
"correspondents": [
c.id for c in match_correspondents(doc, classifier)
],
"tags": [t.id for t in match_tags(doc, classifier)],
"document_types": [
dt.id for dt in match_document_types(doc, classifier)
]
})
@action(methods=['get'], detail=True)
def preview(self, request, pk=None):
try:
response = self.file_response(
pk, request, "inline")
return response
except (FileNotFoundError, Document.DoesNotExist):
raise Http404()
@action(methods=['get'], detail=True)
@cache_control(public=False, max_age=315360000)
def thumb(self, request, pk=None):
try:
doc = Document.objects.get(id=pk)
if doc.storage_type == Document.STORAGE_TYPE_GPG:
handle = GnuPG.decrypted(doc.thumbnail_file)
else:
handle = doc.thumbnail_file
# TODO: Send ETag information and use that to send new thumbnails
# if available
return HttpResponse(handle,
content_type='image/png')
except (FileNotFoundError, Document.DoesNotExist):
raise Http404()
@action(methods=['get'], detail=True)
def download(self, request, pk=None):
try:
return self.file_response(
pk, request, "attachment")
except (FileNotFoundError, Document.DoesNotExist):
raise Http404()
class SearchResultSerializer(DocumentSerializer):
def to_representation(self, instance):
doc = Document.objects.get(id=instance['id'])
r = super(SearchResultSerializer, self).to_representation(doc)
r['__search_hit__'] = {
"score": instance.score,
"highlights": instance.highlights("content",
text=doc.content) if doc else None, # NOQA: E501
"rank": instance.rank
}
return r
class UnifiedSearchViewSet(DocumentViewSet):
def __init__(self, *args, **kwargs):
super(UnifiedSearchViewSet, self).__init__(*args, **kwargs)
self.searcher = None
def get_serializer_class(self):
if self._is_search_request():
return SearchResultSerializer
else:
return DocumentSerializer
def _is_search_request(self):
return ("query" in self.request.query_params or
"more_like_id" in self.request.query_params)
def filter_queryset(self, queryset):
if self._is_search_request():
from documents import index
if "query" in self.request.query_params:
query_class = index.DelayedFullTextQuery
elif "more_like_id" in self.request.query_params:
query_class = index.DelayedMoreLikeThisQuery
else:
raise ValueError()
return query_class(
self.searcher,
self.request.query_params,
self.paginator.get_page_size(self.request))
else:
return super(UnifiedSearchViewSet, self).filter_queryset(queryset)
def list(self, request, *args, **kwargs):
if self._is_search_request():
from documents import index
try:
with index.open_index_searcher() as s:
self.searcher = s
return super(UnifiedSearchViewSet, self).list(request)
except NotFound:
raise
except Exception as e:
return HttpResponseBadRequest(str(e))
else:
return super(UnifiedSearchViewSet, self).list(request)
class LogViewSet(ViewSet):
permission_classes = (IsAuthenticated,)
log_files = ["paperless", "mail"]
def retrieve(self, request, pk=None, *args, **kwargs):
if pk not in self.log_files:
raise Http404()
filename = os.path.join(settings.LOGGING_DIR, f"{pk}.log")
if not os.path.isfile(filename):
raise Http404()
with open(filename, "r") as f:
lines = [line.rstrip() for line in f.readlines()]
return Response(lines)
def list(self, request, *args, **kwargs):
return Response(self.log_files)
class SavedViewViewSet(ModelViewSet):
model = SavedView
queryset = SavedView.objects.all()
serializer_class = SavedViewSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
def get_queryset(self):
user = self.request.user
return SavedView.objects.filter(user=user)
def perform_create(self, serializer):
serializer.save(user=self.request.user)
class BulkEditView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = BulkEditSerializer
parser_classes = (parsers.JSONParser,)
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
method = serializer.validated_data.get("method")
parameters = serializer.validated_data.get("parameters")
documents = serializer.validated_data.get("documents")
try:
# TODO: parameter validation
result = method(documents, **parameters)
return Response({"result": result})
except Exception as e:
return HttpResponseBadRequest(str(e))
class PostDocumentView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = PostDocumentSerializer
parser_classes = (parsers.MultiPartParser,)
def post(self, request, *args, **kwargs):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
doc_name, doc_data = serializer.validated_data.get('document')
correspondent_id = serializer.validated_data.get('correspondent')
document_type_id = serializer.validated_data.get('document_type')
tag_ids = serializer.validated_data.get('tags')
title = serializer.validated_data.get('title')
t = int(mktime(datetime.now().timetuple()))
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
dir=settings.SCRATCH_DIR,
delete=False) as f:
f.write(doc_data)
os.utime(f.name, times=(t, t))
temp_filename = f.name
task_id = str(uuid.uuid4())
async_task("documents.tasks.consume_file",
temp_filename,
override_filename=doc_name,
override_title=title,
override_correspondent_id=correspondent_id,
override_document_type_id=document_type_id,
override_tag_ids=tag_ids,
task_id=task_id,
task_name=os.path.basename(doc_name)[:100])
return Response("OK")
class SelectionDataView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = DocumentListSerializer
parser_classes = (parsers.MultiPartParser, parsers.JSONParser)
def post(self, request, format=None):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
ids = serializer.validated_data.get('documents')
correspondents = Correspondent.objects.annotate(
document_count=Count(Case(
When(documents__id__in=ids, then=1),
output_field=IntegerField()
)))
tags = Tag.objects.annotate(document_count=Count(Case(
When(documents__id__in=ids, then=1),
output_field=IntegerField()
)))
types = DocumentType.objects.annotate(document_count=Count(Case(
When(documents__id__in=ids, then=1),
output_field=IntegerField()
)))
r = Response({
"selected_correspondents": [{
"id": t.id,
"document_count": t.document_count
} for t in correspondents],
"selected_tags": [{
"id": t.id,
"document_count": t.document_count
} for t in tags],
"selected_document_types": [{
"id": t.id,
"document_count": t.document_count
} for t in types]
})
return r
class SearchAutoCompleteView(APIView):
permission_classes = (IsAuthenticated,)
def get(self, request, format=None):
if 'term' in request.query_params:
term = request.query_params['term']
else:
return HttpResponseBadRequest("Term required")
if 'limit' in request.query_params:
limit = int(request.query_params['limit'])
if limit <= 0:
return HttpResponseBadRequest("Invalid limit")
else:
limit = 10
from documents import index
ix = index.open_index()
return Response(index.autocomplete(ix, term, limit))
class StatisticsView(APIView):
permission_classes = (IsAuthenticated,)
def get(self, request, format=None):
documents_total = Document.objects.all().count()
if Tag.objects.filter(is_inbox_tag=True).exists():
documents_inbox = Document.objects.filter(
tags__is_inbox_tag=True).distinct().count()
else:
documents_inbox = None
return Response({
'documents_total': documents_total,
'documents_inbox': documents_inbox,
})
class BulkDownloadView(GenericAPIView):
permission_classes = (IsAuthenticated,)
serializer_class = BulkDownloadSerializer
parser_classes = (parsers.JSONParser,)
def post(self, request, format=None):
serializer = self.get_serializer(data=request.data)
serializer.is_valid(raise_exception=True)
ids = serializer.validated_data.get('documents')
compression = serializer.validated_data.get('compression')
content = serializer.validated_data.get('content')
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
temp = tempfile.NamedTemporaryFile(
dir=settings.SCRATCH_DIR,
suffix="-compressed-archive",
delete=False)
if content == 'both':
strategy_class = OriginalAndArchiveStrategy
elif content == 'originals':
strategy_class = OriginalsOnlyStrategy
else:
strategy_class = ArchiveOnlyStrategy
with zipfile.ZipFile(temp.name, "w", compression) as zipf:
strategy = strategy_class(zipf)
for id in ids:
doc = Document.objects.get(id=id)
strategy.add_document(doc)
with open(temp.name, "rb") as f:
response = HttpResponse(f, content_type="application/zip")
response["Content-Disposition"] = '{}; filename="{}"'.format(
"attachment", "documents.zip")
return response