Merge branch 'dev' into feature-permissions

This commit is contained in:
Michael Shamoon
2023-01-05 19:45:12 -08:00
42 changed files with 2587 additions and 7213 deletions

View File

@@ -5,6 +5,7 @@ from contextlib import contextmanager
from dateutil.parser import isoparse
from django.conf import settings
from documents.models import Comment
from documents.models import Document
from guardian.shortcuts import get_users_with_perms
from whoosh import classify
@@ -50,6 +51,7 @@ def get_schema():
path=TEXT(sortable=True),
path_id=NUMERIC(),
has_path=BOOLEAN(),
comments=TEXT(),
owner=TEXT(),
owner_id=NUMERIC(),
has_owner=BOOLEAN(),
@@ -95,6 +97,7 @@ def open_index_searcher():
def update_document(writer, doc):
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
users_with_perms = get_users_with_perms(
doc,
only_with_perms_in=["view_document"],
@@ -120,6 +123,7 @@ def update_document(writer, doc):
path=doc.storage_path.name if doc.storage_path else None,
path_id=doc.storage_path.id if doc.storage_path else None,
has_path=doc.storage_path is not None,
comments=comments,
owner=doc.owner.username if doc.owner else None,
owner_id=doc.owner.id if doc.owner else None,
has_owner=doc.owner is not None,
@@ -276,7 +280,7 @@ class DelayedFullTextQuery(DelayedQuery):
def _get_query(self):
q_str = self.query_params["query"]
qp = MultifieldParser(
["content", "title", "correspondent", "tag", "type"],
["content", "title", "correspondent", "tag", "type", "comments"],
self.searcher.ixreader.schema,
)
qp.add_plugin(DateParserPlugin())

View File

@@ -6,12 +6,12 @@ import re
import shutil
import subprocess
import tempfile
from functools import cache
from typing import Iterator
from typing import Match
from typing import Optional
from typing import Set
import magic
from django.conf import settings
from django.utils import timezone
from documents.loggers import LoggingMixin
@@ -45,11 +45,20 @@ DATE_REGEX = re.compile(
logger = logging.getLogger("paperless.parsing")
def is_mime_type_supported(mime_type) -> bool:
@cache
def is_mime_type_supported(mime_type: str) -> bool:
"""
Returns True if the mime type is supported, False otherwise
"""
return get_parser_class_for_mime_type(mime_type) is not None
def get_default_file_extension(mime_type) -> str:
@cache
def get_default_file_extension(mime_type: str) -> str:
"""
Returns the default file extension for a mimetype, or
an empty string if it could not be determined
"""
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"]
@@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str:
return ""
def is_file_ext_supported(ext) -> bool:
@cache
def is_file_ext_supported(ext: str) -> bool:
"""
Returns True if the file extension is supported, False otherwise
TODO: Investigate why this really exists, why not use mimetype
"""
if ext:
return ext.lower() in get_supported_file_extensions()
else:
@@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]:
for mime_type in supported_mime_types:
extensions.update(mimetypes.guess_all_extensions(mime_type))
# Python's stdlib might be behind, so also add what the parser
# says is the default extension
# This makes image/webp supported on Python < 3.11
extensions.add(supported_mime_types[mime_type])
return extensions
def get_parser_class_for_mime_type(mime_type):
def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
"""
Returns the best parser (by weight) for the given mimetype or
None if no parser exists
"""
options = []
@@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type):
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def get_parser_class(path):
"""
Determine the appropriate parser class based on the file
"""
mime_type = magic.from_file(path, mime=True)
return get_parser_class_for_mime_type(mime_type)
def run_convert(
input_file,
output_file,

View File

@@ -1,5 +1,3 @@
from unittest import mock
from django.contrib.admin.sites import AdminSite
from django.test import TestCase
from django.utils import timezone

View File

@@ -35,7 +35,6 @@ from documents.models import SavedView
from documents.models import StoragePath
from documents.models import Tag
from documents.models import Comment
from documents.models import StoragePath
from documents.tests.utils import DirectoriesMixin
from paperless import version
from rest_framework.test import APITestCase
@@ -484,7 +483,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertNotIn(result["id"], seen_ids)
seen_ids.append(result["id"])
response = self.client.get(f"/api/documents/?query=content&page=6&page_size=10")
response = self.client.get("/api/documents/?query=content&page=6&page_size=10")
results = response.data["results"]
self.assertEqual(response.data["count"], 55)
self.assertEqual(len(results), 5)
@@ -504,9 +503,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
)
index.update_document(writer, doc)
response = self.client.get(f"/api/documents/?query=content&page=0&page_size=10")
response = self.client.get("/api/documents/?query=content&page=0&page_size=10")
self.assertEqual(response.status_code, 404)
response = self.client.get(f"/api/documents/?query=content&page=3&page_size=10")
response = self.client.get("/api/documents/?query=content&page=3&page_size=10")
self.assertEqual(response.status_code, 404)
@mock.patch("documents.index.autocomplete")
@@ -1084,7 +1083,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(meta["archive_size"], os.stat(archive_file).st_size)
def test_get_metadata_invalid_doc(self):
response = self.client.get(f"/api/documents/34576/metadata/")
response = self.client.get("/api/documents/34576/metadata/")
self.assertEqual(response.status_code, 404)
def test_get_metadata_no_archive(self):
@@ -1149,7 +1148,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
)
def test_get_suggestions_invalid_doc(self):
response = self.client.get(f"/api/documents/34676/suggestions/")
response = self.client.get("/api/documents/34676/suggestions/")
self.assertEqual(response.status_code, 404)
@mock.patch("documents.views.match_storage_paths")

View File

@@ -401,7 +401,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(
cm.output,
[
f"WARNING:paperless.barcodes:No pages to split on!",
"WARNING:paperless.barcodes:No pages to split on!",
],
)

View File

@@ -1,5 +1,4 @@
import textwrap
import unittest
from unittest import mock
from django.core.checks import Error

View File

@@ -1,5 +1,6 @@
import os
import re
import shutil
import tempfile
from pathlib import Path
from unittest import mock
@@ -27,6 +28,9 @@ def dummy_preprocess(content: str):
class TestClassifier(DirectoriesMixin, TestCase):
SAMPLE_MODEL_FILE = os.path.join(os.path.dirname(__file__), "data", "model.pickle")
def setUp(self):
super().setUp()
self.classifier = DocumentClassifier()
@@ -213,13 +217,14 @@ class TestClassifier(DirectoriesMixin, TestCase):
# self.classifier.train()
# self.classifier.save()
@override_settings(
MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"),
)
def test_load_and_classify(self):
# Generate test data, train and save to the model file
# This ensures the model file sklearn version matches
# and eliminates a warning
shutil.copy(
self.SAMPLE_MODEL_FILE,
os.path.join(self.dirs.data_dir, "classification_model.pickle"),
)
self.generate_test_data()
self.classifier.train()
self.classifier.save()
@@ -230,9 +235,6 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])
@override_settings(
MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"),
)
@mock.patch("documents.classifier.pickle.load")
def test_load_corrupt_file(self, patched_pickle_load):
"""
@@ -243,6 +245,10 @@ class TestClassifier(DirectoriesMixin, TestCase):
THEN:
- The ClassifierModelCorruptError is raised
"""
shutil.copy(
self.SAMPLE_MODEL_FILE,
os.path.join(self.dirs.data_dir, "classification_model.pickle"),
)
# First load is the schema version
patched_pickle_load.side_effect = [DocumentClassifier.FORMAT_VERSION, OSError()]

View File

@@ -4,7 +4,6 @@ import re
import shutil
import stat
import tempfile
from subprocess import CalledProcessError
from unittest import mock
from unittest.mock import MagicMock

View File

@@ -9,7 +9,6 @@ from django.test import override_settings
from django.test import TestCase
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
from paperless.settings import DATE_ORDER
class TestDate(TestCase):

View File

@@ -88,10 +88,10 @@ class TestArchiver(DirectoriesMixin, TestCase):
mime_type="application/pdf",
filename="document_01.pdf",
)
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, f"document.pdf"))
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "document.pdf"))
shutil.copy(
sample_file,
os.path.join(self.dirs.originals_dir, f"document_01.pdf"),
os.path.join(self.dirs.originals_dir, "document_01.pdf"),
)
update_document_archive_file(doc2.pk)
@@ -150,7 +150,7 @@ class TestDecryptDocuments(TestCase):
"samples",
"documents",
"thumbnails",
f"0000004.webp.gpg",
"0000004.webp.gpg",
),
os.path.join(thumb_dir, f"{doc.id:07}.webp.gpg"),
)

View File

@@ -5,10 +5,7 @@ from unittest import mock
from django.core.management import call_command
from django.test import TestCase
from documents.management.commands.document_thumbnails import _process_document
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import Tag
from documents.tests.utils import DirectoriesMixin

View File

@@ -7,7 +7,6 @@ from typing import Union
from unittest import mock
from django.test import override_settings
from documents.tests.test_migration_archive_files import thumbnail_path
from documents.tests.utils import TestMigrations

View File

@@ -1,14 +1,8 @@
import os
import shutil
import tempfile
from tempfile import TemporaryDirectory
from unittest import mock
from django.test import override_settings
from django.test import TestCase
from documents.parsers import DocumentParser
from documents.parsers import get_default_file_extension
from documents.parsers import get_parser_class
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import get_supported_file_extensions
from documents.parsers import is_file_ext_supported
@@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser
def fake_magic_from_file(file, mime=False):
if mime:
if os.path.splitext(file)[1] == ".pdf":
return "application/pdf"
else:
return "unknown"
else:
return "A verbose string that describes the contents of the file"
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_1_parser(self, m, *args):
def test_get_parser_class_1_parser(self, m, *args):
"""
GIVEN:
- Parser declared for a given mimetype
WHEN:
- Attempt to get parser for the mimetype
THEN:
- Declared parser class is returned
"""
class DummyParser:
pass
@@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase):
),
)
self.assertEqual(get_parser_class("doc.pdf"), DummyParser)
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_n_parsers(self, m, *args):
def test_get_parser_class_n_parsers(self, m, *args):
"""
GIVEN:
- Two parsers declared for a given mimetype
- Second parser has a higher weight
WHEN:
- Attempt to get parser for the mimetype
THEN:
- Second parser class is returned
"""
class DummyParser1:
pass
@@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase):
),
)
self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)
self.assertEqual(
get_parser_class_for_mime_type("application/pdf"),
DummyParser2,
)
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_0_parsers(self, m, *args):
def test_get_parser_class_0_parsers(self, m, *args):
"""
GIVEN:
- No parsers are declared
WHEN:
- Attempt to get parser for the mimetype
THEN:
- No parser class is returned
"""
m.return_value = []
with TemporaryDirectory() as tmpdir:
self.assertIsNone(get_parser_class("doc.pdf"))
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_get_parser_class_no_valid_parser(self, m, *args):
"""
GIVEN:
- No parser declared for a given mimetype
- Parser declared for a different mimetype
WHEN:
- Attempt to get parser for the given mimetype
THEN:
- No parser class is returned
"""
def fake_get_thumbnail(self, path, mimetype, file_name):
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
class DummyParser:
pass
m.return_value = (
(
None,
{
"weight": 0,
"parser": DummyParser,
"mime_types": {"application/pdf": ".pdf"},
},
),
)
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
class TestParserAvailability(TestCase):
def test_file_extensions(self):
for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
self.assertIn(ext, get_supported_file_extensions())
self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
self.assertEqual(get_default_file_extension("image/png"), ".png")
self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
self.assertEqual(get_default_file_extension("text/plain"), ".txt")
self.assertEqual(get_default_file_extension("text/csv"), ".csv")
supported_mimes_and_exts = [
("application/pdf", ".pdf"),
("image/png", ".png"),
("image/jpeg", ".jpg"),
("image/tiff", ".tif"),
("image/webp", ".webp"),
("text/plain", ".txt"),
("text/csv", ".csv"),
]
supported_exts = get_supported_file_extensions()
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
# Test no parser declared still returns an extension
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
# Test invalid mimetype returns no extension
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
self.assertIsInstance(
@@ -108,7 +156,7 @@ class TestParserAvailability(TestCase):
get_parser_class_for_mime_type("text/plain")(logging_group=None),
TextDocumentParser,
)
self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None)
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
self.assertTrue(is_file_ext_supported(".pdf"))
self.assertFalse(is_file_ext_supported(".hsdfh"))

View File

@@ -494,10 +494,19 @@ class DocumentViewSet(
class SearchResultSerializer(DocumentSerializer, PassUserMixin):
def to_representation(self, instance):
doc = Document.objects.get(id=instance["id"])
commentTerm = instance.results.q.subqueries[0]
comments = ",".join(
[
str(c.comment)
for c in Comment.objects.filter(document=instance["id"])
if commentTerm.text in c.comment
],
)
r = super().to_representation(doc)
r["__search_hit__"] = {
"score": instance.score,
"highlights": instance.highlights("content", text=doc.content)
"highlights": instance.highlights("content", text=doc.content),
"comment_highlights": instance.highlights("content", text=comments)
if doc
else None,
"rank": instance.rank,