- {{(document.correspondent$ | async)?.name}}:
+ {{(document.correspondent$ | async)?.name ?? privateName}}:
{{document.title | documentTitle}}
@@ -41,14 +41,14 @@
-
{{(document.document_type$ | async)?.name}}
+
{{(document.document_type$ | async)?.name ?? privateName}}
diff --git a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.ts b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.ts
index 62f44851e..3dd64818d 100644
--- a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.ts
+++ b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.ts
@@ -76,6 +76,10 @@ export class DocumentCardSmallComponent extends ComponentWithPermissions {
return this.documentService.getPreviewUrl(this.document.id)
}
+ get privateName() {
+ return $localize`Private`
+ }
+
getTagsLimited$() {
const limit = this.document.notes.length > 0 ? 6 : 7
return this.document.tags$.pipe(
From 97cd06d2ba7882dab826f4bdf73b2080638bc37e Mon Sep 17 00:00:00 2001
From: jayme-github
Date: Sun, 4 Jun 2023 19:34:27 +0200
Subject: [PATCH 03/23] Feature: Allow to filter documents by original filename
and checksum (#3485)
* Allow to filter documents by original filename and checksum
This adds filters for the original filename and checksum of documents to
be able to do lazy checks if the file is already stored in paperless.
* Add tests for DelayedQuery
* Add checksum and original_filename to whoosh index and DelayedQuery
* Refactored DelayedQuery to reduce duplicate code
* Choose icontains for checksums as whoosh has no exact match query term
* Bumped index version
* Revert whoosh filtering logic to simpler structure, remove redundant tests
Revert "Revert whoosh filtering logic to simpler structure, remove redundant tests"
This reverts commit 86792174bfbc697f42b72c4b39ee9eba483bb425.
---------
Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
---
docker/docker-prepare.sh | 2 +-
src/documents/filters.py | 2 +
src/documents/index.py | 140 +++++++--------
src/documents/tests/test_api.py | 82 ++++++++-
src/documents/tests/test_delayedquery.py | 219 +++++++++++++++++++++++
5 files changed, 372 insertions(+), 73 deletions(-)
create mode 100644 src/documents/tests/test_delayedquery.py
diff --git a/docker/docker-prepare.sh b/docker/docker-prepare.sh
index 9cf41d42c..6e5f6889a 100755
--- a/docker/docker-prepare.sh
+++ b/docker/docker-prepare.sh
@@ -80,7 +80,7 @@ django_checks() {
search_index() {
- local -r index_version=5
+ local -r index_version=6
local -r index_version_file=${DATA_DIR}/.index_version
if [[ (! -f "${index_version_file}") || $(<"${index_version_file}") != "$index_version" ]]; then
diff --git a/src/documents/filters.py b/src/documents/filters.py
index 53ef0391c..c7f35a6d5 100644
--- a/src/documents/filters.py
+++ b/src/documents/filters.py
@@ -116,6 +116,8 @@ class DocumentFilterSet(FilterSet):
"created": DATE_KWARGS,
"added": DATE_KWARGS,
"modified": DATE_KWARGS,
+ "original_filename": CHAR_KWARGS,
+ "checksum": CHAR_KWARGS,
"correspondent": ["isnull"],
"correspondent__id": ID_KWARGS,
"correspondent__name": CHAR_KWARGS,
diff --git a/src/documents/index.py b/src/documents/index.py
index 054a931e0..9a7505da3 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -64,6 +64,8 @@ def get_schema():
owner_id=NUMERIC(),
has_owner=BOOLEAN(),
viewer_id=KEYWORD(commas=True),
+ checksum=TEXT(),
+ original_filename=TEXT(sortable=True),
)
@@ -149,6 +151,8 @@ def update_document(writer: AsyncWriter, doc: Document):
owner_id=doc.owner.id if doc.owner else None,
has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None,
+ checksum=doc.checksum,
+ original_filename=doc.original_filename,
)
@@ -171,91 +175,85 @@ def remove_document_from_index(document):
class DelayedQuery:
+ param_map = {
+ "correspondent": ("correspondent", ["id", "id__in", "id__none", "isnull"]),
+ "document_type": ("type", ["id", "id__in", "id__none", "isnull"]),
+ "storage_path": ("path", ["id", "id__in", "id__none", "isnull"]),
+ "owner": ("owner", ["id", "id__in", "id__none", "isnull"]),
+ "tags": ("tag", ["id__all", "id__in", "id__none"]),
+ "added": ("added", ["date__lt", "date__gt"]),
+ "created": ("created", ["date__lt", "date__gt"]),
+ "checksum": ("checksum", ["icontains", "istartswith"]),
+ "original_filename": ("original_filename", ["icontains", "istartswith"]),
+ }
+
def _get_query(self):
raise NotImplementedError
def _get_query_filter(self):
criterias = []
- for k, v in self.query_params.items():
- if k == "correspondent__id":
- criterias.append(query.Term("correspondent_id", v))
- elif k == "correspondent__id__in":
- correspondents_in = []
- for correspondent_id in v.split(","):
- correspondents_in.append(
- query.Term("correspondent_id", correspondent_id),
+ for key, value in self.query_params.items():
+ # is_tagged is a special case
+ if key == "is_tagged":
+ criterias.append(query.Term("has_tag", self.evalBoolean(value)))
+ continue
+
+ # Don't process query params without a filter
+ if "__" not in key:
+ continue
+
+ # All other query params consist of a parameter and a query filter
+ param, query_filter = key.split("__", 1)
+ try:
+ field, supported_query_filters = self.param_map[param]
+ except KeyError:
+ logger.error("Unable to build a query filter for parameter %s", key)
+ continue
+
+ # We only support certain filters per parameter
+ if query_filter not in supported_query_filters:
+ logger.info(
+ f"Query filter {query_filter} not supported for parameter {param}",
+ )
+ continue
+
+ if query_filter == "id":
+ criterias.append(query.Term(f"{field}_id", value))
+ elif query_filter == "id__in":
+ in_filter = []
+ for object_id in value.split(","):
+ in_filter.append(
+ query.Term(f"{field}_id", object_id),
)
- criterias.append(query.Or(correspondents_in))
- elif k == "correspondent__id__none":
- for correspondent_id in v.split(","):
+ criterias.append(query.Or(in_filter))
+ elif query_filter == "id__none":
+ for object_id in value.split(","):
criterias.append(
- query.Not(query.Term("correspondent_id", correspondent_id)),
+ query.Not(query.Term(f"{field}_id", object_id)),
)
- elif k == "tags__id__all":
- for tag_id in v.split(","):
- criterias.append(query.Term("tag_id", tag_id))
- elif k == "tags__id__none":
- for tag_id in v.split(","):
- criterias.append(query.Not(query.Term("tag_id", tag_id)))
- elif k == "tags__id__in":
- tags_in = []
- for tag_id in v.split(","):
- tags_in.append(query.Term("tag_id", tag_id))
- criterias.append(query.Or(tags_in))
- elif k == "document_type__id":
- criterias.append(query.Term("type_id", v))
- elif k == "document_type__id__in":
- document_types_in = []
- for document_type_id in v.split(","):
- document_types_in.append(query.Term("type_id", document_type_id))
- criterias.append(query.Or(document_types_in))
- elif k == "document_type__id__none":
- for document_type_id in v.split(","):
- criterias.append(query.Not(query.Term("type_id", document_type_id)))
- elif k == "correspondent__isnull":
+ elif query_filter == "isnull":
criterias.append(
- query.Term("has_correspondent", self.evalBoolean(v) is False),
+ query.Term(f"has_{field}", self.evalBoolean(value) is False),
)
- elif k == "is_tagged":
- criterias.append(query.Term("has_tag", self.evalBoolean(v)))
- elif k == "document_type__isnull":
- criterias.append(query.Term("has_type", self.evalBoolean(v) is False))
- elif k == "created__date__lt":
+ elif query_filter == "id__all":
+ for object_id in value.split(","):
+ criterias.append(query.Term(f"{field}_id", object_id))
+ elif query_filter == "date__lt":
criterias.append(
- query.DateRange("created", start=None, end=isoparse(v)),
+ query.DateRange(field, start=None, end=isoparse(value)),
)
- elif k == "created__date__gt":
+ elif query_filter == "date__gt":
criterias.append(
- query.DateRange("created", start=isoparse(v), end=None),
+ query.DateRange(field, start=isoparse(value), end=None),
+ )
+ elif query_filter == "icontains":
+ criterias.append(
+ query.Term(field, value),
+ )
+ elif query_filter == "istartswith":
+ criterias.append(
+ query.Prefix(field, value),
)
- elif k == "added__date__gt":
- criterias.append(query.DateRange("added", start=isoparse(v), end=None))
- elif k == "added__date__lt":
- criterias.append(query.DateRange("added", start=None, end=isoparse(v)))
- elif k == "storage_path__id":
- criterias.append(query.Term("path_id", v))
- elif k == "storage_path__id__in":
- storage_paths_in = []
- for storage_path_id in v.split(","):
- storage_paths_in.append(query.Term("path_id", storage_path_id))
- criterias.append(query.Or(storage_paths_in))
- elif k == "storage_path__id__none":
- for storage_path_id in v.split(","):
- criterias.append(query.Not(query.Term("path_id", storage_path_id)))
- elif k == "storage_path__isnull":
- criterias.append(query.Term("has_path", self.evalBoolean(v) is False))
- elif k == "owner__isnull":
- criterias.append(query.Term("has_owner", self.evalBoolean(v) is False))
- elif k == "owner__id":
- criterias.append(query.Term("owner_id", v))
- elif k == "owner__id__in":
- owners_in = []
- for owner_id in v.split(","):
- owners_in.append(query.Term("owner_id", owner_id))
- criterias.append(query.Or(owners_in))
- elif k == "owner__id__none":
- for owner_id in v.split(","):
- criterias.append(query.Not(query.Term("owner_id", owner_id)))
user_criterias = get_permissions_criterias(
user=self.user,
diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py
index 96db370ae..82f5b219a 100644
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -420,6 +420,74 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
results = response.data["results"]
self.assertEqual(len(results), 0)
+ def test_document_checksum_filter(self):
+ Document.objects.create(
+ title="none1",
+ checksum="A",
+ mime_type="application/pdf",
+ )
+ doc2 = Document.objects.create(
+ title="none2",
+ checksum="B",
+ mime_type="application/pdf",
+ )
+ Document.objects.create(
+ title="none3",
+ checksum="C",
+ mime_type="application/pdf",
+ )
+
+ response = self.client.get("/api/documents/?checksum__iexact=B")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+ results = response.data["results"]
+ self.assertEqual(len(results), 1)
+ self.assertEqual(results[0]["id"], doc2.id)
+
+ response = self.client.get("/api/documents/?checksum__iexact=X")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+ results = response.data["results"]
+ self.assertEqual(len(results), 0)
+
+ def test_document_original_filename_filter(self):
+ doc1 = Document.objects.create(
+ title="none1",
+ checksum="A",
+ mime_type="application/pdf",
+ original_filename="docA.pdf",
+ )
+ doc2 = Document.objects.create(
+ title="none2",
+ checksum="B",
+ mime_type="application/pdf",
+ original_filename="docB.pdf",
+ )
+ doc3 = Document.objects.create(
+ title="none3",
+ checksum="C",
+ mime_type="application/pdf",
+ original_filename="docC.pdf",
+ )
+
+ response = self.client.get("/api/documents/?original_filename__iexact=DOCa.pdf")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+ results = response.data["results"]
+ self.assertEqual(len(results), 1)
+ self.assertEqual(results[0]["id"], doc1.id)
+
+ response = self.client.get("/api/documents/?original_filename__iexact=docx.pdf")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+ results = response.data["results"]
+ self.assertEqual(len(results), 0)
+
+ response = self.client.get("/api/documents/?original_filename__istartswith=dOc")
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+ results = response.data["results"]
+ self.assertEqual(len(results), 3)
+ self.assertCountEqual(
+ [results[0]["id"], results[1]["id"], results[2]["id"]],
+ [doc1.id, doc2.id, doc3.id],
+ )
+
def test_documents_title_content_filter(self):
doc1 = Document.objects.create(
title="title A",
@@ -1086,17 +1154,19 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
checksum="4",
created=timezone.make_aware(datetime.datetime(2020, 7, 13)),
content="test",
+ original_filename="doc4.pdf",
)
d4.tags.add(t2)
d5 = Document.objects.create(
checksum="5",
added=timezone.make_aware(datetime.datetime(2020, 7, 13)),
content="test",
+ original_filename="doc5.pdf",
)
Document.objects.create(checksum="6", content="test2")
d7 = Document.objects.create(checksum="7", storage_path=sp, content="test")
d8 = Document.objects.create(
- checksum="8",
+ checksum="foo",
correspondent=c2,
document_type=dt2,
storage_path=sp2,
@@ -1239,6 +1309,16 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
),
)
+ self.assertEqual(
+ search_query("&checksum__icontains=foo"),
+ [d8.id],
+ )
+
+ self.assertCountEqual(
+ search_query("&original_filename__istartswith=doc"),
+ [d4.id, d5.id],
+ )
+
def test_search_filtering_respect_owner(self):
"""
GIVEN:
diff --git a/src/documents/tests/test_delayedquery.py b/src/documents/tests/test_delayedquery.py
new file mode 100644
index 000000000..962df7192
--- /dev/null
+++ b/src/documents/tests/test_delayedquery.py
@@ -0,0 +1,219 @@
+from dateutil.parser import isoparse
+from django.test import TestCase
+from whoosh import query
+
+from documents.index import DelayedQuery
+from documents.index import get_permissions_criterias
+from documents.models import User
+
+
+class TestDelayedQuery(TestCase):
+ def setUp(self):
+ super().setUp()
+ # all tests run without permission criteria, so has_no_owner query will always
+ # be appended.
+ self.has_no_owner = query.Or([query.Term("has_owner", False)])
+
+ def _get_testset__id__in(self, param, field):
+ return (
+ {f"{param}__id__in": "42,43"},
+ query.And(
+ [
+ query.Or(
+ [
+ query.Term(f"{field}_id", "42"),
+ query.Term(f"{field}_id", "43"),
+ ],
+ ),
+ self.has_no_owner,
+ ],
+ ),
+ )
+
+ def _get_testset__id__none(self, param, field):
+ return (
+ {f"{param}__id__none": "42,43"},
+ query.And(
+ [
+ query.Not(query.Term(f"{field}_id", "42")),
+ query.Not(query.Term(f"{field}_id", "43")),
+ self.has_no_owner,
+ ],
+ ),
+ )
+
+ def test_get_permission_criteria(self):
+ # tests contain tuples of user instances and the expected filter
+ tests = (
+ (None, [query.Term("has_owner", False)]),
+ (User(42, username="foo", is_superuser=True), []),
+ (
+ User(42, username="foo", is_superuser=False),
+ [
+ query.Term("has_owner", False),
+ query.Term("owner_id", 42),
+ query.Term("viewer_id", "42"),
+ ],
+ ),
+ )
+ for user, expected in tests:
+ self.assertEqual(get_permissions_criterias(user), expected)
+
+ def test_no_query_filters(self):
+ dq = DelayedQuery(None, {}, None, None)
+ self.assertEqual(dq._get_query_filter(), self.has_no_owner)
+
+ def test_date_query_filters(self):
+ def _get_testset(param: str):
+ date_str = "1970-01-01T02:44"
+ date_obj = isoparse(date_str)
+ return (
+ (
+ {f"{param}__date__lt": date_str},
+ query.And(
+ [
+ query.DateRange(param, start=None, end=date_obj),
+ self.has_no_owner,
+ ],
+ ),
+ ),
+ (
+ {f"{param}__date__gt": date_str},
+ query.And(
+ [
+ query.DateRange(param, start=date_obj, end=None),
+ self.has_no_owner,
+ ],
+ ),
+ ),
+ )
+
+ query_params = ["created", "added"]
+ for param in query_params:
+ for params, expected in _get_testset(param):
+ dq = DelayedQuery(None, params, None, None)
+ got = dq._get_query_filter()
+ self.assertCountEqual(got, expected)
+
+ def test_is_tagged_query_filter(self):
+ tests = (
+ ("True", True),
+ ("true", True),
+ ("1", True),
+ ("False", False),
+ ("false", False),
+ ("0", False),
+ ("foo", False),
+ )
+ for param, expected in tests:
+ dq = DelayedQuery(None, {"is_tagged": param}, None, None)
+ self.assertEqual(
+ dq._get_query_filter(),
+ query.And([query.Term("has_tag", expected), self.has_no_owner]),
+ )
+
+ def test_tags_query_filters(self):
+ # tests contain tuples of query_parameter dicts and the expected whoosh query
+ param = "tags"
+ field, _ = DelayedQuery.param_map[param]
+ tests = (
+ (
+ {f"{param}__id__all": "42,43"},
+ query.And(
+ [
+ query.Term(f"{field}_id", "42"),
+ query.Term(f"{field}_id", "43"),
+ self.has_no_owner,
+ ],
+ ),
+ ),
+ # tags does not allow __id
+ (
+ {f"{param}__id": "42"},
+ self.has_no_owner,
+ ),
+ # tags does not allow __isnull
+ (
+ {f"{param}__isnull": "true"},
+ self.has_no_owner,
+ ),
+ self._get_testset__id__in(param, field),
+ self._get_testset__id__none(param, field),
+ )
+
+ for params, expected in tests:
+ dq = DelayedQuery(None, params, None, None)
+ got = dq._get_query_filter()
+ self.assertCountEqual(got, expected)
+
+ def test_generic_query_filters(self):
+ def _get_testset(param: str):
+ field, _ = DelayedQuery.param_map[param]
+ return (
+ (
+ {f"{param}__id": "42"},
+ query.And(
+ [
+ query.Term(f"{field}_id", "42"),
+ self.has_no_owner,
+ ],
+ ),
+ ),
+ self._get_testset__id__in(param, field),
+ self._get_testset__id__none(param, field),
+ (
+ {f"{param}__isnull": "true"},
+ query.And(
+ [
+ query.Term(f"has_{field}", False),
+ self.has_no_owner,
+ ],
+ ),
+ ),
+ (
+ {f"{param}__isnull": "false"},
+ query.And(
+ [
+ query.Term(f"has_{field}", True),
+ self.has_no_owner,
+ ],
+ ),
+ ),
+ )
+
+ query_params = ["correspondent", "document_type", "storage_path", "owner"]
+ for param in query_params:
+ for params, expected in _get_testset(param):
+ dq = DelayedQuery(None, params, None, None)
+ got = dq._get_query_filter()
+ self.assertCountEqual(got, expected)
+
+ def test_char_query_filter(self):
+ def _get_testset(param: str):
+ return (
+ (
+ {f"{param}__icontains": "foo"},
+ query.And(
+ [
+ query.Term(f"{param}", "foo"),
+ self.has_no_owner,
+ ],
+ ),
+ ),
+ (
+ {f"{param}__istartswith": "foo"},
+ query.And(
+ [
+ query.Prefix(f"{param}", "foo"),
+ self.has_no_owner,
+ ],
+ ),
+ ),
+ )
+
+ query_params = ["checksum", "original_filename"]
+ for param in query_params:
+ for params, expected in _get_testset(param):
+ dq = DelayedQuery(None, params, None, None)
+ got = dq._get_query_filter()
+ self.assertCountEqual(got, expected)
From 304324ebd042dd4fdd0d7389afc6c094992dfbff Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Sun, 4 Jun 2023 10:41:45 -0700
Subject: [PATCH 04/23] Update index.py
---
src/documents/index.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/documents/index.py b/src/documents/index.py
index 9a7505da3..34e0fd14b 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -207,7 +207,7 @@ class DelayedQuery:
try:
field, supported_query_filters = self.param_map[param]
except KeyError:
- logger.error("Unable to build a query filter for parameter %s", key)
+ logger.error(f"Unable to build a query filter for parameter {key}")
continue
# We only support certain filters per parameter
From 6e65558ea41c10509f027438cb9e42e1e395aa45 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 1 Jun 2023 13:32:58 -0700
Subject: [PATCH 05/23] Swapping out the tika and replaces requests with httpx
---
Pipfile | 4 +-
Pipfile.lock | 237 ++++++++++++++++++------------
src/paperless_mail/parsers.py | 8 +-
src/paperless_tika/parsers.py | 4 +-
src/paperless_tika/tests/utils.py | 11 ++
5 files changed, 166 insertions(+), 98 deletions(-)
create mode 100644 src/paperless_tika/tests/utils.py
diff --git a/Pipfile b/Pipfile
index 49cdc32c2..edb0e46a9 100644
--- a/Pipfile
+++ b/Pipfile
@@ -37,14 +37,13 @@ psycopg2 = "*"
rapidfuzz = "*"
redis = {extras = ["hiredis"], version = "*"}
scikit-learn = "~=1.2"
-numpy = "*"
whitenoise = "~=6.3"
watchdog = "~=2.2"
whoosh="~=2.7"
inotifyrecursive = "~=0.3"
ocrmypdf = "~=14.0"
tqdm = "*"
-tika = "*"
+tika-client = "*"
channels = "~=4.0"
channels-redis = "*"
uvicorn = {extras = ["standard"], version = "*"}
@@ -78,6 +77,7 @@ factory-boy = "*"
pytest = "*"
pytest-cov = "*"
pytest-django = "*"
+pytest-httpx = "*"
pytest-env = "*"
pytest-sugar = "*"
pytest-xdist = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index c826846be..e92c913c4 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "271fd0b623bee180093e65238a9e1fc7bfaf9c292364d479f935da751958bfd4"
+ "sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3"
},
"pipfile-spec": 6,
"requires": {},
@@ -652,6 +652,14 @@
],
"version": "==2.2.3"
},
+ "httpcore": {
+ "hashes": [
+ "sha256:125f8375ab60036db632f34f4b627a9ad085048eef7cb7d2616fea0f739f98af",
+ "sha256:5581b9c12379c4288fe70f43c710d16060c10080617001e6b22a3b6dbcbefd36"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.17.2"
+ },
"httptools": {
"hashes": [
"sha256:0297822cea9f90a38df29f48e40b42ac3d48a28637368f3ec6d15eebefd182f9",
@@ -698,6 +706,14 @@
],
"version": "==0.5.0"
},
+ "httpx": {
+ "hashes": [
+ "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
+ "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.24.1"
+ },
"humanfriendly": {
"hashes": [
"sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477",
@@ -986,7 +1002,7 @@
"sha256:ecde0f8adef7dfdec993fd54b0f78183051b6580f606111a6d789cd14c61ea0c",
"sha256:f21c442fdd2805e91799fbe044a7b999b8571bb0ab0f7850d0cb9641a687092b"
],
- "index": "pypi",
+ "markers": "python_version >= '3.8'",
"version": "==1.24.3"
},
"ocrmypdf": {
@@ -1553,14 +1569,6 @@
"index": "pypi",
"version": "==3.6.12"
},
- "requests": {
- "hashes": [
- "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f",
- "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"
- ],
- "markers": "python_version >= '3.7'",
- "version": "==2.31.0"
- },
"scikit-learn": {
"hashes": [
"sha256:065e9673e24e0dc5113e2dd2b4ca30c9d8aa2fa90f4c0597241c93b63130d233",
@@ -1735,13 +1743,13 @@
"markers": "python_version >= '3.6'",
"version": "==3.1.0"
},
- "tika": {
+ "tika-client": {
"hashes": [
- "sha256:3b136ae517db6c69c5ddee3a6a5c98e8966fedfc7c9155ebaaf3b9269121f992",
- "sha256:56670eb812944eb25ed73f1b3b075aa41e7a135b74b240822f28b819e5b373da"
+ "sha256:6f2afab12eb46cd7b4ed6c34c9c2a1791a45d2f479c0da0076936dc6dbfe8061",
+ "sha256:f2c23cb76677b7b8be70e2d95ac3418ed046b1514bff920f7460beae1ca3342b"
],
"index": "pypi",
- "version": "==2.6.0"
+ "version": "==0.0.2"
},
"tornado": {
"hashes": [
@@ -1784,14 +1792,6 @@
"markers": "python_version >= '3.7'",
"version": "==5.0.1"
},
- "urllib3": {
- "hashes": [
- "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc",
- "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"
- ],
- "markers": "python_version >= '3.7'",
- "version": "==2.0.2"
- },
"uvicorn": {
"extras": [
"standard"
@@ -2095,6 +2095,14 @@
}
},
"develop": {
+ "anyio": {
+ "hashes": [
+ "sha256:275d9973793619a5374e1c89a4f4ad3f4b0a5510a2b5b939444bee8f4c4d37ce",
+ "sha256:eddca883c4175f14df8aedce21054bfca3adb70ffe76a9f607aef9d7fa2ea7f0"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==3.7.0"
+ },
"asgiref": {
"hashes": [
"sha256:89b2ef2247e3b562a16eef663bc0e2e703ec6468e2fa8a5cd61cd449786d4f6e",
@@ -2350,60 +2358,69 @@
"toml"
],
"hashes": [
- "sha256:004948e296149644d208964300cb3d98affc5211e9e490e9979af4030b0d6473",
- "sha256:13cde6bb0e58fb67d09e2f373de3899d1d1e866c5a9ff05d93615f2f54fbd2bb",
- "sha256:1c9e4a5eb1bbc3675ee57bc31f8eea4cd7fb0cbcbe4912cf1cb2bf3b754f4a80",
- "sha256:2025f913f2edb0272ef15d00b1f335ff8908c921c8eb2013536fcaf61f5a683d",
- "sha256:25bad4196104761bc26b1dae9b57383826542ec689ff0042f7f4f4dd7a815cba",
- "sha256:2692306d3d4cb32d2cceed1e47cebd6b1d2565c993d6d2eda8e6e6adf53301e6",
- "sha256:272ab31228a9df857ab5df5d67936d8861464dc89c5d3fab35132626e9369379",
- "sha256:2e8c0e79820cdd67978e1120983786422d279e07a381dbf89d03bbb23ec670a6",
- "sha256:3062fd5c62df988cea9f2972c593f77fed1182bfddc5a3b12b1e606cb7aba99e",
- "sha256:3436927d1794fa6763b89b60c896f9e3bd53212001026ebc9080d23f0c2733c1",
- "sha256:35db06450272473eab4449e9c2ad9bc6a0a68dab8e81a0eae6b50d9c2838767e",
- "sha256:392154d09bd4473b9d11351ab5d63391f3d5d24d752f27b3be7498b0ee2b5226",
- "sha256:3cff6980fe7100242170092bb40d2b1cdad79502cd532fd26b12a2b8a5f9aee0",
- "sha256:42c692b55a647a832025a4c048007034fe77b162b566ad537ce65ad824b12a84",
- "sha256:44c9b9f1a245f3d0d202b1a8fa666a80b5ecbe4ad5d0859c0fb16a52d9763224",
- "sha256:496b86f1fc9c81a1cd53d8842ef712e950a4611bba0c42d33366a7b91ba969ec",
- "sha256:4bbd58eb5a2371bf160590f4262109f66b6043b0b991930693134cb617bc0169",
- "sha256:4e3783a286d5a93a2921396d50ce45a909aa8f13eee964465012f110f0cbb611",
- "sha256:4f3c7c19581d471af0e9cb49d928172cd8492cd78a2b7a4e82345d33662929bb",
- "sha256:52c139b7ab3f0b15f9aad0a3fedef5a1f8c0b2bdc291d88639ca2c97d3682416",
- "sha256:541280dde49ce74a4262c5e395b48ea1207e78454788887118c421cb4ffbfcac",
- "sha256:5906f6a84b47f995cd1bf0aca1c72d591c55ee955f98074e93660d64dfc66eb9",
- "sha256:6284a2005e4f8061c58c814b1600ad0074ccb0289fe61ea709655c5969877b70",
- "sha256:6727a0d929ff0028b1ed8b3e7f8701670b1d7032f219110b55476bb60c390bfb",
- "sha256:697f4742aa3f26c107ddcb2b1784a74fe40180014edbd9adaa574eac0529914c",
- "sha256:6b9f64526286255735847aed0221b189486e0b9ed943446936e41b7e44b08783",
- "sha256:6babcbf1e66e46052442f10833cfc4a0d3554d8276aa37af8531a83ed3c1a01d",
- "sha256:6e7f1a8328eeec34c54f1d5968a708b50fc38d31e62ca8b0560e84a968fbf9a9",
- "sha256:71f739f97f5f80627f1fee2331e63261355fd1e9a9cce0016394b6707ac3f4ec",
- "sha256:76d06b721c2550c01a60e5d3093f417168658fb454e5dfd9a23570e9bffe39a1",
- "sha256:77a04b84d01f0e12c66f16e69e92616442dc675bbe51b90bfb074b1e5d1c7fbd",
- "sha256:97729e6828643f168a2a3f07848e1b1b94a366b13a9f5aba5484c2215724edc8",
- "sha256:9a8723ccec4e564d4b9a79923246f7b9a8de4ec55fa03ec4ec804459dade3c4f",
- "sha256:a5ffd45c6b93c23a8507e2f436983015c6457aa832496b6a095505ca2f63e8f1",
- "sha256:a6f03f87fea579d55e0b690d28f5042ec1368650466520fbc400e7aeaf09e995",
- "sha256:aac1d5fdc5378f6bac2c0c7ebe7635a6809f5b4376f6cf5d43243c1917a67087",
- "sha256:ae82c5f168d2a39a5d69a12a69d4dc23837a43cf2ca99be60dfe59996ea6b113",
- "sha256:bc7b667f8654376e9353dd93e55e12ce2a59fb6d8e29fce40de682273425e044",
- "sha256:c1d7a31603c3483ac49c1726723b0934f88f2c011c660e6471e7bd735c2fa110",
- "sha256:c534431153caffc7c495c3eddf7e6a6033e7f81d78385b4e41611b51e8870446",
- "sha256:c93d52c3dc7b9c65e39473704988602300e3cc1bad08b5ab5b03ca98bbbc68c1",
- "sha256:cbcc874f454ee51f158afd604a315f30c0e31dff1d5d5bf499fc529229d964dd",
- "sha256:d3cacc6a665221108ecdf90517a8028d07a2783df3417d12dcfef1c517e67478",
- "sha256:d712cefff15c712329113b01088ba71bbcef0f7ea58478ca0bbec63a824844cb",
- "sha256:d7786b2fa7809bf835f830779ad285215a04da76293164bb6745796873f0942d",
- "sha256:dc11b42fa61ff1e788dd095726a0aed6aad9c03d5c5984b54cb9e1e67b276aa5",
- "sha256:dc4d5187ef4d53e0d4c8eaf530233685667844c5fb0b855fea71ae659017854b",
- "sha256:f5440cdaf3099e7ab17a5a7065aed59aff8c8b079597b61c1f8be6f32fe60636",
- "sha256:fa079995432037b5e2ef5ddbb270bcd2ded9f52b8e191a5de11fe59a00ea30d8",
- "sha256:fbe6e8c0a9a7193ba10ee52977d4d5e7652957c1f56ccefed0701db8801a2a3b",
- "sha256:fde5c7a9d9864d3e07992f66767a9817f24324f354caa3d8129735a3dc74f126"
+ "sha256:06a9a2be0b5b576c3f18f1a241f0473575c4a26021b52b2a85263a00f034d51f",
+ "sha256:06fb182e69f33f6cd1d39a6c597294cff3143554b64b9825d1dc69d18cc2fff2",
+ "sha256:0a5f9e1dbd7fbe30196578ca36f3fba75376fb99888c395c5880b355e2875f8a",
+ "sha256:0e1f928eaf5469c11e886fe0885ad2bf1ec606434e79842a879277895a50942a",
+ "sha256:171717c7cb6b453aebac9a2ef603699da237f341b38eebfee9be75d27dc38e01",
+ "sha256:1e9d683426464e4a252bf70c3498756055016f99ddaec3774bf368e76bbe02b6",
+ "sha256:201e7389591af40950a6480bd9edfa8ed04346ff80002cec1a66cac4549c1ad7",
+ "sha256:245167dd26180ab4c91d5e1496a30be4cd721a5cf2abf52974f965f10f11419f",
+ "sha256:2aee274c46590717f38ae5e4650988d1af340fe06167546cc32fe2f58ed05b02",
+ "sha256:2e07b54284e381531c87f785f613b833569c14ecacdcb85d56b25c4622c16c3c",
+ "sha256:31563e97dae5598556600466ad9beea39fb04e0229e61c12eaa206e0aa202063",
+ "sha256:33d6d3ea29d5b3a1a632b3c4e4f4ecae24ef170b0b9ee493883f2df10039959a",
+ "sha256:3d376df58cc111dc8e21e3b6e24606b5bb5dee6024f46a5abca99124b2229ef5",
+ "sha256:419bfd2caae268623dd469eff96d510a920c90928b60f2073d79f8fe2bbc5959",
+ "sha256:48c19d2159d433ccc99e729ceae7d5293fbffa0bdb94952d3579983d1c8c9d97",
+ "sha256:49969a9f7ffa086d973d91cec8d2e31080436ef0fb4a359cae927e742abfaaa6",
+ "sha256:52edc1a60c0d34afa421c9c37078817b2e67a392cab17d97283b64c5833f427f",
+ "sha256:537891ae8ce59ef63d0123f7ac9e2ae0fc8b72c7ccbe5296fec45fd68967b6c9",
+ "sha256:54b896376ab563bd38453cecb813c295cf347cf5906e8b41d340b0321a5433e5",
+ "sha256:58c2ccc2f00ecb51253cbe5d8d7122a34590fac9646a960d1430d5b15321d95f",
+ "sha256:5b7540161790b2f28143191f5f8ec02fb132660ff175b7747b95dcb77ac26562",
+ "sha256:5baa06420f837184130752b7c5ea0808762083bf3487b5038d68b012e5937dbe",
+ "sha256:5e330fc79bd7207e46c7d7fd2bb4af2963f5f635703925543a70b99574b0fea9",
+ "sha256:61b9a528fb348373c433e8966535074b802c7a5d7f23c4f421e6c6e2f1697a6f",
+ "sha256:63426706118b7f5cf6bb6c895dc215d8a418d5952544042c8a2d9fe87fcf09cb",
+ "sha256:6d040ef7c9859bb11dfeb056ff5b3872436e3b5e401817d87a31e1750b9ae2fb",
+ "sha256:6f48351d66575f535669306aa7d6d6f71bc43372473b54a832222803eb956fd1",
+ "sha256:7ee7d9d4822c8acc74a5e26c50604dff824710bc8de424904c0982e25c39c6cb",
+ "sha256:81c13a1fc7468c40f13420732805a4c38a105d89848b7c10af65a90beff25250",
+ "sha256:8d13c64ee2d33eccf7437961b6ea7ad8673e2be040b4f7fd4fd4d4d28d9ccb1e",
+ "sha256:8de8bb0e5ad103888d65abef8bca41ab93721647590a3f740100cd65c3b00511",
+ "sha256:8fa03bce9bfbeeef9f3b160a8bed39a221d82308b4152b27d82d8daa7041fee5",
+ "sha256:924d94291ca674905fe9481f12294eb11f2d3d3fd1adb20314ba89e94f44ed59",
+ "sha256:975d70ab7e3c80a3fe86001d8751f6778905ec723f5b110aed1e450da9d4b7f2",
+ "sha256:976b9c42fb2a43ebf304fa7d4a310e5f16cc99992f33eced91ef6f908bd8f33d",
+ "sha256:9e31cb64d7de6b6f09702bb27c02d1904b3aebfca610c12772452c4e6c21a0d3",
+ "sha256:a342242fe22407f3c17f4b499276a02b01e80f861f1682ad1d95b04018e0c0d4",
+ "sha256:a3d33a6b3eae87ceaefa91ffdc130b5e8536182cd6dfdbfc1aa56b46ff8c86de",
+ "sha256:a895fcc7b15c3fc72beb43cdcbdf0ddb7d2ebc959edac9cef390b0d14f39f8a9",
+ "sha256:afb17f84d56068a7c29f5fa37bfd38d5aba69e3304af08ee94da8ed5b0865833",
+ "sha256:b1c546aca0ca4d028901d825015dc8e4d56aac4b541877690eb76490f1dc8ed0",
+ "sha256:b29019c76039dc3c0fd815c41392a044ce555d9bcdd38b0fb60fb4cd8e475ba9",
+ "sha256:b46517c02ccd08092f4fa99f24c3b83d8f92f739b4657b0f146246a0ca6a831d",
+ "sha256:b7aa5f8a41217360e600da646004f878250a0d6738bcdc11a0a39928d7dc2050",
+ "sha256:b7b4c971f05e6ae490fef852c218b0e79d4e52f79ef0c8475566584a8fb3e01d",
+ "sha256:ba90a9563ba44a72fda2e85302c3abc71c5589cea608ca16c22b9804262aaeb6",
+ "sha256:cb017fd1b2603ef59e374ba2063f593abe0fc45f2ad9abdde5b4d83bd922a353",
+ "sha256:d22656368f0e6189e24722214ed8d66b8022db19d182927b9a248a2a8a2f67eb",
+ "sha256:d2c2db7fd82e9b72937969bceac4d6ca89660db0a0967614ce2481e81a0b771e",
+ "sha256:d39b5b4f2a66ccae8b7263ac3c8170994b65266797fb96cbbfd3fb5b23921db8",
+ "sha256:d62a5c7dad11015c66fbb9d881bc4caa5b12f16292f857842d9d1871595f4495",
+ "sha256:e7d9405291c6928619403db1d10bd07888888ec1abcbd9748fdaa971d7d661b2",
+ "sha256:e84606b74eb7de6ff581a7915e2dab7a28a0517fbe1c9239eb227e1354064dcd",
+ "sha256:eb393e5ebc85245347950143969b241d08b52b88a3dc39479822e073a1a8eb27",
+ "sha256:ebba1cd308ef115925421d3e6a586e655ca5a77b5bf41e02eb0e4562a111f2d1",
+ "sha256:ee57190f24fba796e36bb6d3aa8a8783c643d8fa9760c89f7a98ab5455fbf818",
+ "sha256:f2f67fe12b22cd130d34d0ef79206061bfb5eda52feb6ce0dba0644e20a03cf4",
+ "sha256:f6951407391b639504e3b3be51b7ba5f3528adbf1a8ac3302b687ecababf929e",
+ "sha256:f75f7168ab25dd93110c8a8117a22450c19976afbc44234cbf71481094c1b850",
+ "sha256:fdec9e8cbf13a5bf63290fc6013d216a4c7232efb51548594ca3631a7f13c3a3"
],
"markers": "python_version >= '3.7'",
- "version": "==7.2.6"
+ "version": "==7.2.7"
},
"cryptography": {
"hashes": [
@@ -2471,11 +2488,11 @@
},
"faker": {
"hashes": [
- "sha256:80a5ea1464556c06b98bf47ea3adc7f33811a1182518d847860b1874080bd3c9",
- "sha256:defe9ed618a67ebf0f3eb1895e198c2355a7128a09087a6dce342ef2253263ea"
+ "sha256:a70de9ec7a14a02d278755a11134baa5a297bb82600f115022d0d07080a9e77a",
+ "sha256:dd15fa165ced55f668fbb0ad20ece98ab78ddacd58dc056950d66980ff61fa79"
],
"markers": "python_version >= '3.7'",
- "version": "==18.9.0"
+ "version": "==18.10.0"
},
"filelock": {
"hashes": [
@@ -2492,6 +2509,30 @@
],
"version": "==2.1.0"
},
+ "h11": {
+ "hashes": [
+ "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d",
+ "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.14.0"
+ },
+ "httpcore": {
+ "hashes": [
+ "sha256:125f8375ab60036db632f34f4b627a9ad085048eef7cb7d2616fea0f739f98af",
+ "sha256:5581b9c12379c4288fe70f43c710d16060c10080617001e6b22a3b6dbcbefd36"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.17.2"
+ },
+ "httpx": {
+ "hashes": [
+ "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
+ "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==0.24.1"
+ },
"hyperlink": {
"hashes": [
"sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b",
@@ -2636,11 +2677,11 @@
},
"mkdocs-material": {
"hashes": [
- "sha256:1ae74cc5464ef2f64574d4884512efed7f4db386fb9bc6af20fd427d7a702f49",
- "sha256:b56a9f955ed32d38333715cbbf68ce38f683bf38610c65094fa4ef2db9f08bcd"
+ "sha256:8513ab847c9a541ed3d11a3a7eed556caf72991ee786c31c5aac6691a121088a",
+ "sha256:b49e12869ab464558e2dd3c5792da5b748a7e0c48ee83b4d05715f98125a7a39"
],
"index": "pypi",
- "version": "==9.1.14"
+ "version": "==9.1.15"
},
"mkdocs-material-extensions": {
"hashes": [
@@ -2697,7 +2738,7 @@
"sha256:ecde0f8adef7dfdec993fd54b0f78183051b6580f606111a6d789cd14c61ea0c",
"sha256:f21c442fdd2805e91799fbe044a7b999b8571bb0ab0f7850d0cb9641a687092b"
],
- "index": "pypi",
+ "markers": "python_version >= '3.8'",
"version": "==1.24.3"
},
"packaging": {
@@ -2898,6 +2939,14 @@
"index": "pypi",
"version": "==0.8.1"
},
+ "pytest-httpx": {
+ "hashes": [
+ "sha256:3a82797f3a9a14d51e8c6b7fa97524b68b847ee801109c062e696b4744f4431c",
+ "sha256:cefb7dcf66a4cb0601b0de05e576cca423b6081f3245e7912a4d84c58fa3eae8"
+ ],
+ "index": "pypi",
+ "version": "==0.22.0"
+ },
"pytest-sugar": {
"hashes": [
"sha256:8cb5a4e5f8bbcd834622b0235db9e50432f4cbd71fef55b467fe44e43701e062",
@@ -3183,6 +3232,14 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.16.0"
},
+ "sniffio": {
+ "hashes": [
+ "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101",
+ "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"
+ ],
+ "markers": "python_version >= '3.7'",
+ "version": "==1.3.0"
+ },
"termcolor": {
"hashes": [
"sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475",
@@ -3354,11 +3411,11 @@
},
"celery-types": {
"hashes": [
- "sha256:1bcdd44614248cca1ac962017cf76392e16a86e2f9a0404e683063fe4e6a10b1",
- "sha256:806a5a62aeeebc59f20d96fb075b1c1fe28a31bfc6cb57ae9b4e2e4cceb38f88"
+ "sha256:324f52a936d36636236c8caca48f4dddb2d5077971d04275ac0959018a9d3d5e",
+ "sha256:c130770514e68069363ca3b27759bb9d34bd7e99fcfa7ad2469588f9f55478b4"
],
"index": "pypi",
- "version": "==0.15.0"
+ "version": "==0.17.0"
},
"certifi": {
"hashes": [
@@ -3749,10 +3806,10 @@
},
"types-pyopenssl": {
"hashes": [
- "sha256:ad024b07a1f4bffbca44699543c71efd04733a6c22781fa9673a971e410a3086",
- "sha256:e7211088eff3e20d359888dedecb0994f7181d5cce0f26354dd47ca0484dc8a6"
+ "sha256:43e307e8dfb3a7a8208a19874ca060305f460c529d4eaca8a2669ea89499f244",
+ "sha256:ba803a99440b0c2e9ab4e197084aeefc55bdfe8a580d367b2aa4210810a21240"
],
- "version": "==23.1.0.3"
+ "version": "==23.2.0.0"
},
"types-python-dateutil": {
"hashes": [
@@ -3786,10 +3843,10 @@
},
"types-requests": {
"hashes": [
- "sha256:7c5cea7940f8e92ec560bbc468f65bf684aa3dcf0554a6f8c4710f5f708dc598",
- "sha256:c1c29d20ab8d84dff468d7febfe8e0cb0b4664543221b386605e14672b44ea25"
+ "sha256:3de667cffa123ce698591de0ad7db034a5317457a596eb0b4944e5a9d9e8d1ac",
+ "sha256:afb06ef8f25ba83d59a1d424bd7a5a939082f94b94e90ab5e6116bd2559deaa3"
],
- "version": "==2.31.0.0"
+ "version": "==2.31.0.1"
},
"types-setuptools": {
"hashes": [
diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py
index ded4d6a9b..7cd5e06e6 100644
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -4,7 +4,7 @@ from html import escape
from io import BytesIO
from io import StringIO
-import requests
+import httpx
from bleach import clean
from bleach import linkify
from django.conf import settings
@@ -185,7 +185,7 @@ class MailDocumentParser(DocumentParser):
files[name] = (name, BytesIO(content))
headers = {}
try:
- response = requests.post(url_merge, files=files, headers=headers)
+ response = httpx.post(url_merge, files=files, headers=headers)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(f"Error while converting document to PDF: {err}") from err
@@ -280,7 +280,7 @@ class MailDocumentParser(DocumentParser):
data["pdfFormat"] = "PDF/A-3b"
try:
- response = requests.post(
+ response = httpx.post(
url,
files=files,
headers=headers,
@@ -336,7 +336,7 @@ class MailDocumentParser(DocumentParser):
"scale": "1.0",
}
try:
- response = requests.post(
+ response = httpx.post(
url,
files=files,
headers=headers,
diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py
index d71f038a3..345dc06e5 100644
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -2,7 +2,7 @@ import os
from pathlib import Path
import dateutil.parser
-import requests
+import httpx
from django.conf import settings
from tika import parser
@@ -106,7 +106,7 @@ class TikaDocumentParser(DocumentParser):
data["pdfFormat"] = "PDF/A-3b"
try:
- response = requests.post(url, files=files, headers=headers, data=data)
+ response = httpx.post(url, files=files, headers=headers, data=data)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(
diff --git a/src/paperless_tika/tests/utils.py b/src/paperless_tika/tests/utils.py
new file mode 100644
index 000000000..b26f79ec6
--- /dev/null
+++ b/src/paperless_tika/tests/utils.py
@@ -0,0 +1,11 @@
+import pytest
+from pytest_httpx import HTTPXMock
+
+
+class HttpxMockMixin:
+ @pytest.fixture(autouse=True)
+ def httpx_mock_auto(self, httpx_mock: HTTPXMock):
+ """
+ Workaround for allowing use of a fixture with unittest style testing
+ """
+ self.httpx_mock = httpx_mock
From 2c1cd25be4a54670161450bc1696f361a651631d Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Thu, 1 Jun 2023 14:50:08 -0700
Subject: [PATCH 06/23] Rewrites the email parsing to be more clear and
concise.
Adds testing to use httpx mocked responses to stand in as a server even offline
---
.github/workflows/ci.yml | 15 +-
Pipfile.lock | 6 +-
src/documents/tests/utils.py | 14 +
src/paperless_mail/parsers.py | 327 +++++----
src/paperless_mail/tests/test_parsers.py | 663 +++++++++---------
src/paperless_mail/tests/test_parsers_live.py | 355 +++-------
src/paperless_tika/parsers.py | 54 +-
src/paperless_tika/tests/test_live_tika.py | 7 +-
src/paperless_tika/tests/test_tika_parser.py | 83 ++-
9 files changed, 701 insertions(+), 823 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e8c9bb533..9d2c510ca 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -106,15 +106,6 @@ jobs:
matrix:
python-version: ['3.8', '3.9', '3.10']
fail-fast: false
- env:
- # Enable Tika end to end testing
- TIKA_LIVE: 1
- # Enable paperless_mail testing against real server
- PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}
- PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }}
- PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }}
- # Enable Gotenberg end to end testing
- GOTENBERG_LIVE: 1
steps:
-
name: Checkout
@@ -156,6 +147,12 @@ jobs:
pipenv --python ${{ steps.setup-python.outputs.python-version }} run pip list
-
name: Tests
+ env:
+ PAPERLESS_CI_TEST: 1
+ # Enable paperless_mail testing against real server
+ PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}
+ PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }}
+ PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }}
run: |
cd src/
pipenv --python ${{ steps.setup-python.outputs.python-version }} run pytest -ra
diff --git a/Pipfile.lock b/Pipfile.lock
index e92c913c4..d9e6b8d56 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1745,11 +1745,11 @@
},
"tika-client": {
"hashes": [
- "sha256:6f2afab12eb46cd7b4ed6c34c9c2a1791a45d2f479c0da0076936dc6dbfe8061",
- "sha256:f2c23cb76677b7b8be70e2d95ac3418ed046b1514bff920f7460beae1ca3342b"
+ "sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156",
+ "sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610"
],
"index": "pypi",
- "version": "==0.0.2"
+ "version": "==0.0.3"
},
"tornado": {
"hashes": [
diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py
index fbde3345c..483d3b12d 100644
--- a/src/documents/tests/utils.py
+++ b/src/documents/tests/utils.py
@@ -105,6 +105,20 @@ class FileSystemAssertsMixin:
def assertIsNotDir(self, path: Union[PathLike, str]):
self.assertFalse(Path(path).resolve().is_dir(), f"Dir does exist: {path}")
+ def assertFilesEqual(
+ self,
+ path1: Union[PathLike, str],
+ path2: Union[PathLike, str],
+ ):
+ path1 = Path(path1)
+ path2 = Path(path2)
+ import hashlib
+
+ hash1 = hashlib.sha256(path1.read_bytes()).hexdigest()
+ hash2 = hashlib.sha256(path2.read_bytes()).hexdigest()
+
+ self.assertEqual(hash1, hash2, "File SHA256 mismatch")
+
class ConsumerProgressMixin:
def setUp(self) -> None:
diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py
index 7cd5e06e6..3ec3e64a0 100644
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -1,8 +1,7 @@
-import os
import re
from html import escape
-from io import BytesIO
-from io import StringIO
+from pathlib import Path
+from typing import List
import httpx
from bleach import clean
@@ -11,8 +10,9 @@ from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from humanfriendly import format_size
+from imap_tools import MailAttachment
from imap_tools import MailMessage
-from tika import parser
+from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@@ -22,33 +22,15 @@ from documents.parsers import make_thumbnail_from_pdf
class MailDocumentParser(DocumentParser):
"""
This parser uses imap_tools to parse .eml files, generates pdf using
- gotenbergs and sends the html part to a local tika server for text extraction.
+ Gotenberg and sends the html part to a Tika server for text extraction.
"""
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
tika_server = settings.TIKA_ENDPOINT
logging_name = "paperless.parsing.mail"
- _parsed = None
- def get_parsed(self, document_path) -> MailMessage:
- if not self._parsed:
- try:
- with open(document_path, "rb") as eml:
- self._parsed = MailMessage.from_bytes(eml.read())
- except Exception as err:
- raise ParseError(
- f"Could not parse {document_path}: {err}",
- ) from err
- if not self._parsed.from_values:
- self._parsed = None
- raise ParseError(
- f"Could not parse {document_path}: Missing 'from'",
- )
-
- return self._parsed
-
- def get_thumbnail(self, document_path, mime_type, file_name=None):
+ def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
if not self.archive_path:
self.archive_path = self.generate_pdf(document_path)
@@ -58,11 +40,11 @@ class MailDocumentParser(DocumentParser):
self.logging_group,
)
- def extract_metadata(self, document_path, mime_type):
+ def extract_metadata(self, document_path: Path, mime_type: str):
result = []
try:
- mail = self.get_parsed(document_path)
+ mail = self.parse_file_to_message(document_path)
except ParseError as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
@@ -106,101 +88,157 @@ class MailDocumentParser(DocumentParser):
result.sort(key=lambda item: (item["prefix"], item["key"]))
return result
- def parse(self, document_path, mime_type, file_name=None):
+ def parse(self, document_path: Path, mime_type: str, file_name=None):
+ """
+ Parses the given .eml into formatted text, based on the decoded email.
+
+ """
+
def strip_text(text: str):
+ """
+ Reduces the spacing of the given text string
+ """
text = re.sub(r"\s+", " ", text)
text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
- mail = self.get_parsed(document_path)
+ def build_formatted_text(mail_message: MailMessage) -> str:
+ """
+ Constructs a formatted string, based on the given email. Basically tries
+ to get most of the email content, included front matter, into a nice string
+ """
+ fmt_text = f"Subject: {mail_message.subject}\n\n"
+ fmt_text += f"From: {mail_message.from_values.full}\n\n"
+ to_list = [address.full for address in mail_message.to_values]
+ fmt_text += f"To: {', '.join(to_list)}\n\n"
+ if mail_message.cc_values:
+ fmt_text += (
+ f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
+ )
+ if mail_message.bcc_values:
+ fmt_text += (
+ f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
+ )
+ if mail_message.attachments:
+ att = []
+ for a in mail.attachments:
+ att.append(f"{a.filename} ({format_size(a.size, binary=True)})")
+ fmt_text += f"Attachments: {', '.join(att)}\n\n"
- self.text = f"Subject: {mail.subject}\n\n"
- self.text += f"From: {mail.from_values.full}\n\n"
- self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n"
- if len(mail.cc_values) >= 1:
- self.text += (
- f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
- )
- if len(mail.bcc_values) >= 1:
- self.text += (
- f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
- )
- if len(mail.attachments) >= 1:
- att = []
- for a in mail.attachments:
- att.append(f"{a.filename} ({format_size(a.size, binary=True)})")
+ if mail.html:
+ fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
- self.text += f"Attachments: {', '.join(att)}\n\n"
+ fmt_text += f"\n\n{strip_text(mail.text)}"
- if mail.html:
- self.text += "HTML content: " + strip_text(self.tika_parse(mail.html))
+ return fmt_text
- self.text += f"\n\n{strip_text(mail.text)}"
+ self.log.debug(f"Parsing file {document_path.name} into an email")
+ mail = self.parse_file_to_message(document_path)
+
+ self.log.debug("Building formatted text from email")
+ self.text = build_formatted_text(mail)
if is_naive(mail.date):
self.date = make_aware(mail.date)
else:
self.date = mail.date
- self.archive_path = self.generate_pdf(document_path)
+ self.log.debug("Creating a PDF from the email")
+ self.archive_path = self.generate_pdf(mail)
+
+ @staticmethod
+ def parse_file_to_message(filepath: Path) -> MailMessage:
+ """
+ Parses the given .eml file into a MailMessage object
+ """
+ try:
+ with filepath.open("rb") as eml:
+ parsed = MailMessage.from_bytes(eml.read())
+ if parsed.from_values is None:
+ raise ParseError(
+ f"Could not parse {filepath}: Missing 'from'",
+ )
+ except Exception as err:
+ raise ParseError(
+ f"Could not parse {filepath}: {err}",
+ ) from err
+
+ return parsed
def tika_parse(self, html: str):
self.log.info("Sending content to Tika server")
try:
- parsed = parser.from_buffer(html, self.tika_server)
+ with TikaClient(tika_url=self.tika_server) as client:
+ parsed = client.tika.as_text.from_buffer(html, "text/html")
+
+ if "X-TIKA:content" in parsed.data:
+ return parsed.data["X-TIKA:content"].strip()
+ return ""
except Exception as err:
raise ParseError(
f"Could not parse content with tika server at "
f"{self.tika_server}: {err}",
) from err
- if parsed["content"]:
- return parsed["content"]
+
+ def generate_pdf(self, mail_message: MailMessage) -> Path:
+ archive_path = Path(self.tempdir) / "merged.pdf"
+
+ mail_pdf_file = self.generate_pdf_from_mail(mail_message)
+
+ # If no HTML content, create the PDF from the message
+ # Otherwise, create 2 PDFs and merge them with Gotenberg
+ if not mail_message.html:
+ archive_path.write_bytes(mail_pdf_file.read_bytes())
else:
- return ""
+ url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
- def generate_pdf(self, document_path):
- pdf_collection = []
- url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
- pdf_path = os.path.join(self.tempdir, "merged.pdf")
- mail = self.get_parsed(document_path)
-
- pdf_collection.append(("1_mail.pdf", self.generate_pdf_from_mail(mail)))
-
- if not mail.html:
- with open(pdf_path, "wb") as file:
- file.write(pdf_collection[0][1])
- file.close()
- return pdf_path
- else:
- pdf_collection.append(
- (
- "2_html.pdf",
- self.generate_pdf_from_html(mail.html, mail.attachments),
- ),
+ pdf_of_html_content = self.generate_pdf_from_html(
+ mail_message.html,
+ mail_message.attachments,
)
- files = {}
- for name, content in pdf_collection:
- files[name] = (name, BytesIO(content))
- headers = {}
- try:
- response = httpx.post(url_merge, files=files, headers=headers)
- response.raise_for_status() # ensure we notice bad responses
- except Exception as err:
- raise ParseError(f"Error while converting document to PDF: {err}") from err
+ pdf_collection = {
+ "1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"),
+ "2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"),
+ }
- with open(pdf_path, "wb") as file:
- file.write(response.content)
- file.close()
+ try:
+ # Open a handle to each file, replacing the tuple
+ for filename in pdf_collection:
+ file_multi_part = pdf_collection[filename]
+ pdf_collection[filename] = (
+ file_multi_part[0],
+ file_multi_part[1].open("rb"),
+ file_multi_part[2],
+ )
- return pdf_path
+ response = httpx.post(url_merge, files=pdf_collection)
+ response.raise_for_status() # ensure we notice bad responses
- @staticmethod
- def mail_to_html(mail: MailMessage) -> StringIO:
- data = {}
+ archive_path.write_bytes(response.content)
- def clean_html(text: str):
+ except Exception as err:
+ raise ParseError(
+ f"Error while merging email HTML into PDF: {err}",
+ ) from err
+ finally:
+ for filename in pdf_collection:
+ file_multi_part_handle = pdf_collection[filename][1]
+ file_multi_part_handle.close()
+
+ return archive_path
+
+ def mail_to_html(self, mail: MailMessage) -> Path:
+ """
+ Converts the given email into an HTML file, formatted
+ based on the given template
+ """
+
+ def clean_html(text: str) -> str:
+ """
+ Attempts to clean, escape and linkify the given HTML string
+ """
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if type(text) != str:
@@ -211,6 +249,8 @@ class MailDocumentParser(DocumentParser):
text = text.replace("\n", "
")
return text
+ data = {}
+
data["subject"] = clean_html(mail.subject)
if data["subject"]:
data["subject_label"] = "Subject"
@@ -237,27 +277,33 @@ class MailDocumentParser(DocumentParser):
data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M"))
data["content"] = clean_html(mail.text.strip())
- html = StringIO()
-
from django.template.loader import render_to_string
- rendered = render_to_string("email_msg_template.html", context=data)
+ html_file = Path(self.tempdir) / "email_as_html.html"
+ html_file.write_text(render_to_string("email_msg_template.html", context=data))
- html.write(rendered)
- html.seek(0)
+ return html_file
- return html
-
- def generate_pdf_from_mail(self, mail):
+ def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
+ """
+ Creates a PDF based on the given email, using the email's values in a
+ an HTML template
+ """
url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log.info("Converting mail to PDF")
- css_file = os.path.join(os.path.dirname(__file__), "templates/output.css")
+ css_file = Path(__file__).parent / "templates" / "output.css"
+ email_html_file = self.mail_to_html(mail)
- with open(css_file, "rb") as css_handle:
+ print(css_file)
+ print(email_html_file)
+
+ with css_file.open("rb") as css_handle, email_html_file.open(
+ "rb",
+ ) as email_html_handle:
files = {
- "html": ("index.html", self.mail_to_html(mail)),
- "css": ("output.css", css_handle),
+ "html": ("index.html", email_html_handle, "text/html"),
+ "css": ("output.css", css_handle, "text/css"),
}
headers = {}
data = {
@@ -289,13 +335,23 @@ class MailDocumentParser(DocumentParser):
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(
- f"Error while converting document to PDF: {err}",
+ f"Error while converting email to PDF: {err}",
) from err
- return response.content
+ email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
+ email_as_pdf_file.write_bytes(response.content)
+
+ return email_as_pdf_file
+
+ def generate_pdf_from_html(
+ self,
+ orig_html: str,
+ attachments: List[MailAttachment],
+ ) -> Path:
+ """
+ Generates a PDF file based on the HTML and attachments of the email
+ """
- @staticmethod
- def transform_inline_html(html, attachments):
def clean_html_script(text: str):
compiled_open = re.compile(re.escape("