From 8b16cd99dc9dd259a73f28cdfe3a141107c2d2f7 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Thu, 3 Dec 2020 18:36:23 +0100 Subject: [PATCH] updated the API, it now supports tags, correspondents, types and title when uploading documents. --- docs/api.rst | 63 ++++++++++++++++++++-- src/documents/forms.py | 59 --------------------- src/documents/serialisers.py | 85 +++++++++++++++++++++++++++++ src/documents/tests/test_api.py | 94 +++++++++++++++++++++++++++++++-- src/documents/views.py | 70 +++++++++++++++++++----- src/paperless/settings.py | 4 +- src/paperless/urls.py | 9 +++- 7 files changed, 302 insertions(+), 82 deletions(-) delete mode 100644 src/documents/forms.py diff --git a/docs/api.rst b/docs/api.rst index 4f41832de..523ca1b45 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -38,6 +38,50 @@ individual documents: are in place. However, if you use these old URLs to access documents, you should update your app or script to use the new URLs. +.. note:: + + The document endpoint provides tags, document types and correspondents as + ids in their corresponding fields. These are writeable. Paperless also + offers read-only objects for assigned tags, types and correspondents, + however, these might be removed in the future. As for now, the front end + requires them. + +Authorization +############# + +The REST api provides three different forms of authentication. + +1. Basic authentication + + Authorize by providing a HTTP header in the form + + .. code:: + + Authorization: Basic <credentials> + + where ``credentials`` is a base64-encoded string of ``<username>:<password>`` + +2. Session authentication + + When you're logged into paperless in your browser, you're automatically + logged into the API as well and don't need to provide any authorization + headers. + +3. Token authentication + + Paperless also offers an endpoint to acquire authentication tokens. 
+ + POST a username and password as a form or json string to ``/api/token/`` + and paperless will respond with a token, if the login data is correct. + This token can be used to authenticate other requests with the + following HTTP header: + + .. code:: + + Authorization: Token <token> + + Tokens can be managed and revoked in the paperless admin. + Searching for documents ####################### @@ -166,8 +210,19 @@ The API provides a special endpoint for file uploads: POST a multipart form to this endpoint, where the form field ``document`` contains the document that you want to upload to paperless. The filename is sanitized and -then used to store the document in the consumption folder, where the consumer will -detect the document and process it as any other document. +then used to store the document in a temporary directory, and the consumer will +be instructed to consume the document from there. -The endpoint will immediately return "OK." if the document was stored in the -consumption directory. +The endpoint supports the following optional form fields: + +* ``title``: Specify a title that the consumer should use for the document. +* ``correspondent``: Specify a correspondent that the consumer should use for the document. + Case sensitive. If the specified correspondent does not exist, it will be created with this + name and default settings. +* ``document_type``: Similar to correspondent. +* ``tags``: Similar to correspondent. Specify this multiple times to have multiple tags added + to the document. + +The endpoint will immediately return "OK" if the document consumption process +was started successfully. No additional status information about the consumption +process itself is available, since that happens in a different process. 
diff --git a/src/documents/forms.py b/src/documents/forms.py deleted file mode 100644 index 63dd307b2..000000000 --- a/src/documents/forms.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import tempfile -from datetime import datetime -from time import mktime - -import magic -from django import forms -from django.conf import settings -from django_q.tasks import async_task -from pathvalidate import validate_filename, ValidationError - -from documents.parsers import is_mime_type_supported - - -class UploadForm(forms.Form): - - document = forms.FileField() - - def clean_document(self): - document_name = self.cleaned_data.get("document").name - - try: - validate_filename(document_name) - except ValidationError: - raise forms.ValidationError("That filename is suspicious.") - - document_data = self.cleaned_data.get("document").read() - - mime_type = magic.from_buffer(document_data, mime=True) - - if not is_mime_type_supported(mime_type): - raise forms.ValidationError("This mime type is not supported.") - - return document_name, document_data - - def save(self): - """ - Since the consumer already does a lot of work, it's easier just to save - to-be-consumed files to the consumption directory rather than have the - form do that as well. Think of it as a poor-man's queue server. 
- """ - - original_filename, data = self.cleaned_data.get("document") - - t = int(mktime(datetime.now().timetuple())) - - os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - - with tempfile.NamedTemporaryFile(prefix="paperless-upload-", - dir=settings.SCRATCH_DIR, - delete=False) as f: - - f.write(data) - os.utime(f.name, times=(t, t)) - - async_task("documents.tasks.consume_file", - f.name, - override_filename=original_filename, - task_name=os.path.basename(original_filename)[:100]) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index c86aa8c83..14102df5c 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -1,6 +1,9 @@ +import magic +from pathvalidate import validate_filename, ValidationError from rest_framework import serializers from .models import Correspondent, Tag, Document, Log, DocumentType +from .parsers import is_mime_type_supported class CorrespondentSerializer(serializers.HyperlinkedModelSerializer): @@ -113,3 +116,85 @@ class LogSerializer(serializers.ModelSerializer): "group", "level" ) + + +class PostDocumentSerializer(serializers.Serializer): + + document = serializers.FileField( + label="Document", + write_only=True, + ) + + title = serializers.CharField( + label="Title", + write_only=True, + required=False, + ) + + correspondent = serializers.CharField( + label="Correspondent", + write_only=True, + required=False, + ) + + document_type = serializers.CharField( + label="Document type", + write_only=True, + required=False, + ) + + tags = serializers.ListField( + child=serializers.CharField(), + label="Tags", + source="tag", + write_only=True, + required=False, + ) + + def validate(self, attrs): + document = attrs.get('document') + + try: + validate_filename(document.name) + except ValidationError: + raise serializers.ValidationError("Invalid filename.") + + document_data = document.file.read() + mime_type = magic.from_buffer(document_data, mime=True) + + if not is_mime_type_supported(mime_type): + 
raise serializers.ValidationError( + "This mime type is not supported.") + + attrs['document_data'] = document_data + + title = attrs.get('title') + + if not title: + attrs['title'] = None + + correspondent = attrs.get('correspondent') + if correspondent: + c, _ = Correspondent.objects.get_or_create(name=correspondent) + attrs['correspondent_id'] = c.id + else: + attrs['correspondent_id'] = None + + document_type = attrs.get('document_type') + if document_type: + dt, _ = DocumentType.objects.get_or_create(name=document_type) + attrs['document_type_id'] = dt.id + else: + attrs['document_type_id'] = None + + tags = attrs.get('tag') + if tags: + tag_ids = [] + for tag in tags: + tag, _ = Tag.objects.get_or_create(name=tag) + tag_ids.append(tag.id) + attrs['tag_ids'] = tag_ids + else: + attrs['tag_ids'] = None + + return attrs diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index 01eb17b49..e2e1b254e 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -358,7 +358,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): self.assertEqual(response.data['documents_total'], 3) self.assertEqual(response.data['documents_inbox'], 1) - @mock.patch("documents.forms.async_task") + @mock.patch("documents.views.async_task") def test_upload(self, m): with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: @@ -370,8 +370,12 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): args, kwargs = m.call_args self.assertEqual(kwargs['override_filename'], "simple.pdf") + self.assertIsNone(kwargs['override_title']) + self.assertIsNone(kwargs['override_correspondent_id']) + self.assertIsNone(kwargs['override_document_type_id']) + self.assertIsNone(kwargs['override_tag_ids']) - @mock.patch("documents.forms.async_task") + @mock.patch("documents.views.async_task") def test_upload_invalid_form(self, m): with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: @@ 
-379,7 +383,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): self.assertEqual(response.status_code, 400) m.assert_not_called() - @mock.patch("documents.forms.async_task") + @mock.patch("documents.views.async_task") def test_upload_invalid_file(self, m): with open(os.path.join(os.path.dirname(__file__), "samples", "simple.zip"), "rb") as f: @@ -387,8 +391,8 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): self.assertEqual(response.status_code, 400) m.assert_not_called() - @mock.patch("documents.forms.async_task") - @mock.patch("documents.forms.validate_filename") + @mock.patch("documents.views.async_task") + @mock.patch("documents.serialisers.validate_filename") def test_upload_invalid_filename(self, validate_filename, async_task): validate_filename.side_effect = ValidationError() with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: @@ -396,3 +400,83 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): self.assertEqual(response.status_code, 400) async_task.assert_not_called() + + @mock.patch("documents.views.async_task") + def test_upload_with_title(self, async_task): + with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: + response = self.client.post("/api/documents/post_document/", {"document": f, "title": "my custom title"}) + self.assertEqual(response.status_code, 200) + + async_task.assert_called_once() + + args, kwargs = async_task.call_args + + self.assertEqual(kwargs['override_title'], "my custom title") + + @mock.patch("documents.views.async_task") + def test_upload_with_correspondent(self, async_task): + c = Correspondent.objects.create(name="test-corres") + with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: + response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": "test-corres"}) + self.assertEqual(response.status_code, 200) + + async_task.assert_called_once() + + args, kwargs = 
async_task.call_args + + self.assertEqual(kwargs['override_correspondent_id'], c.id) + + @mock.patch("documents.views.async_task") + def test_upload_with_new_correspondent(self, async_task): + with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: + response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": "test-corres2"}) + self.assertEqual(response.status_code, 200) + + async_task.assert_called_once() + + args, kwargs = async_task.call_args + + c = Correspondent.objects.get(name="test-corres2") + self.assertEqual(kwargs['override_correspondent_id'], c.id) + + @mock.patch("documents.views.async_task") + def test_upload_with_document_type(self, async_task): + dt = DocumentType.objects.create(name="invoice") + with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: + response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": "invoice"}) + self.assertEqual(response.status_code, 200) + + async_task.assert_called_once() + + args, kwargs = async_task.call_args + + self.assertEqual(kwargs['override_document_type_id'], dt.id) + + @mock.patch("documents.views.async_task") + def test_upload_with_new_document_type(self, async_task): + with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: + response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": "invoice2"}) + self.assertEqual(response.status_code, 200) + + async_task.assert_called_once() + + args, kwargs = async_task.call_args + + dt = DocumentType.objects.get(name="invoice2") + self.assertEqual(kwargs['override_document_type_id'], dt.id) + + @mock.patch("documents.views.async_task") + def test_upload_with_tags(self, async_task): + t1 = Tag.objects.create(name="tag1") + with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: + response = self.client.post( + "/api/documents/post_document/", 
+ {"document": f, "tags": ["tag1", "tag2"]}) + self.assertEqual(response.status_code, 200) + + async_task.assert_called_once() + + args, kwargs = async_task.call_args + + t2 = Tag.objects.get(name="tag2") + self.assertCountEqual(kwargs['override_tag_ids'], [t1.id, t2.id]) diff --git a/src/documents/views.py b/src/documents/views.py index 922854f57..adef757ef 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,10 +1,16 @@ import os +import tempfile +from datetime import datetime +from time import mktime +from django.conf import settings from django.db.models import Count, Max from django.http import HttpResponse, HttpResponseBadRequest, Http404 from django.views.decorators.cache import cache_control from django.views.generic import TemplateView from django_filters.rest_framework import DjangoFilterBackend +from django_q.tasks import async_task +from rest_framework import parsers from rest_framework.decorators import action from rest_framework.filters import OrderingFilter, SearchFilter from rest_framework.mixins import ( @@ -32,14 +38,14 @@ from .filters import ( DocumentTypeFilterSet, LogFilterSet ) -from .forms import UploadForm from .models import Correspondent, Document, Log, Tag, DocumentType from .serialisers import ( CorrespondentSerializer, DocumentSerializer, LogSerializer, TagSerializer, - DocumentTypeSerializer + DocumentTypeSerializer, + PostDocumentSerializer ) @@ -154,16 +160,6 @@ class DocumentViewSet(RetrieveModelMixin, disposition, filename) return response - @action(methods=['post'], detail=False) - def post_document(self, request, pk=None): - # TODO: is this a good implementation? 
- form = UploadForm(data=request.POST, files=request.FILES) - if form.is_valid(): - form.save() - return Response("OK") - else: - return HttpResponseBadRequest(str(form.errors)) - @action(methods=['get'], detail=True) def metadata(self, request, pk=None): try: @@ -217,6 +213,56 @@ class LogViewSet(ReadOnlyModelViewSet): ordering_fields = ("created",) +class PostDocumentView(APIView): + + permission_classes = (IsAuthenticated,) + serializer_class = PostDocumentSerializer + parser_classes = (parsers.MultiPartParser,) + + def get_serializer_context(self): + return { + 'request': self.request, + 'format': self.format_kwarg, + 'view': self + } + + def get_serializer(self, *args, **kwargs): + kwargs['context'] = self.get_serializer_context() + return self.serializer_class(*args, **kwargs) + + def post(self, request, *args, **kwargs): + + serializer = self.get_serializer(data=request.data) + serializer.is_valid(raise_exception=True) + + document = serializer.validated_data['document'] + document_data = serializer.validated_data['document_data'] + correspondent_id = serializer.validated_data['correspondent_id'] + document_type_id = serializer.validated_data['document_type_id'] + tag_ids = serializer.validated_data['tag_ids'] + title = serializer.validated_data['title'] + + t = int(mktime(datetime.now().timetuple())) + + os.makedirs(settings.SCRATCH_DIR, exist_ok=True) + + with tempfile.NamedTemporaryFile(prefix="paperless-upload-", + dir=settings.SCRATCH_DIR, + delete=False) as f: + f.write(document_data) + os.utime(f.name, times=(t, t)) + + async_task("documents.tasks.consume_file", + f.name, + override_filename=document.name, + override_title=title, + override_correspondent_id=correspondent_id, + override_document_type_id=document_type_id, + override_tag_ids=tag_ids, + task_name=os.path.basename(document.name)[:100]) + return Response("OK") + + class SearchView(APIView): permission_classes = (IsAuthenticated,) diff --git a/src/paperless/settings.py 
b/src/paperless/settings.py index 410b8454a..88915c7c5 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -86,6 +86,7 @@ INSTALLED_APPS = [ "django.contrib.admin", "rest_framework", + "rest_framework.authtoken", "django_filters", "django_q", @@ -95,7 +96,8 @@ INSTALLED_APPS = [ REST_FRAMEWORK = { 'DEFAULT_AUTHENTICATION_CLASSES': [ 'rest_framework.authentication.BasicAuthentication', - 'rest_framework.authentication.SessionAuthentication' + 'rest_framework.authentication.SessionAuthentication', + 'rest_framework.authentication.TokenAuthentication' ] } diff --git a/src/paperless/urls.py b/src/paperless/urls.py index dd5e6a379..9b390b139 100755 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -4,6 +4,7 @@ from django.contrib.auth.decorators import login_required from django.urls import path, re_path from django.views.decorators.csrf import csrf_exempt from django.views.generic import RedirectView +from rest_framework.authtoken import views from rest_framework.routers import DefaultRouter from documents.views import ( @@ -15,7 +16,8 @@ from documents.views import ( SearchView, IndexView, SearchAutoCompleteView, - StatisticsView + StatisticsView, + PostDocumentView ) from paperless.views import FaviconView @@ -45,6 +47,11 @@ urlpatterns = [ StatisticsView.as_view(), name="statistics"), + re_path(r"^documents/post_document/", PostDocumentView.as_view(), + name="post_document"), + + path('token/', views.obtain_auth_token) + ] + api_router.urls)), re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),