Merge remote-tracking branch 'origin/dev'

This commit is contained in:
Trenton H 2023-06-15 08:54:03 -07:00
commit bfc271e743
36 changed files with 1941 additions and 1516 deletions

View File

@ -106,15 +106,6 @@ jobs:
matrix:
python-version: ['3.8', '3.9', '3.10']
fail-fast: false
env:
# Enable Tika end to end testing
TIKA_LIVE: 1
# Enable paperless_mail testing against real server
PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}
PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }}
PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }}
# Enable Gotenberg end to end testing
GOTENBERG_LIVE: 1
steps:
-
name: Checkout
@ -156,12 +147,18 @@ jobs:
pipenv --python ${{ steps.setup-python.outputs.python-version }} run pip list
-
name: Tests
env:
PAPERLESS_CI_TEST: 1
# Enable paperless_mail testing against real server
PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}
PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }}
PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }}
run: |
cd src/
pipenv --python ${{ steps.setup-python.outputs.python-version }} run pytest -ra
-
name: Upload coverage to Codecov
if: ${{ matrix.python-version == env.DEFAULT_PYTHON_VERSION && github.event_name == 'push'}}
if: ${{ matrix.python-version == env.DEFAULT_PYTHON_VERSION }}
uses: codecov/codecov-action@v3
with:
# not required for public repos, but intermittently fails otherwise
@ -309,7 +306,7 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.docker-meta.outputs.tags }}
labels: ${{ steps.docker-meta.outputs.labels }}
# Get cache layers from this branch, then dev, then main
# Get cache layers from this branch, then dev
# This allows new branches to get at least some cache benefits, generally from dev
cache-from: |
type=registry,ref=ghcr.io/${{ steps.set-ghcr-repository.outputs.ghcr-repository }}/builder/cache/app:${{ github.ref_name }}

View File

@ -51,14 +51,6 @@ jobs:
include:
- primary-name: "paperless-ngx"
- primary-name: "paperless-ngx/builder/cache/app"
- primary-name: "paperless-ngx/builder/qpdf"
- primary-name: "paperless-ngx/builder/cache/qpdf"
- primary-name: "paperless-ngx/builder/pikepdf"
- primary-name: "paperless-ngx/builder/cache/pikepdf"
- primary-name: "paperless-ngx/builder/jbig2enc"
- primary-name: "paperless-ngx/builder/cache/jbig2enc"
- primary-name: "paperless-ngx/builder/psycopg2"
- primary-name: "paperless-ngx/builder/cache/psycopg2"
# TODO: Remove the above and replace with the below
# - primary-name: "builder/qpdf"
# - primary-name: "builder/cache/qpdf"

View File

@ -37,7 +37,7 @@ repos:
exclude: "(^Pipfile\\.lock$)"
# Python hooks
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.265'
rev: 'v0.0.272'
hooks:
- id: ruff
- repo: https://github.com/psf/black
@ -57,6 +57,6 @@ repos:
args:
- "--tab"
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: "v0.9.0.2"
rev: "v0.9.0.5"
hooks:
- id: shellcheck

View File

@ -5,7 +5,7 @@
# Purpose: Compiles the frontend
# Notes:
# - Does NPM stuff with Typescript and such
FROM --platform=$BUILDPLATFORM node:16-bullseye-slim AS compile-frontend
FROM --platform=$BUILDPLATFORM docker.io/node:16-bookworm-slim AS compile-frontend
COPY ./src-ui /src/src-ui
@ -21,7 +21,7 @@ RUN set -eux \
# Comments:
# - pipenv dependencies are not left in the final image
# - pipenv can't touch the final image somehow
FROM --platform=$BUILDPLATFORM python:3.9-alpine as pipenv-base
FROM --platform=$BUILDPLATFORM docker.io/python:3.9-alpine as pipenv-base
WORKDIR /usr/src/pipenv
@ -37,7 +37,7 @@ RUN set -eux \
# Purpose: The final image
# Comments:
# - Don't leave anything extra in here
FROM python:3.9-slim-bullseye as main-app
FROM docker.io/python:3.9-slim-bookworm as main-app
LABEL org.opencontainers.image.authors="paperless-ngx team <hello@paperless-ngx.com>"
LABEL org.opencontainers.image.documentation="https://docs.paperless-ngx.com/"
@ -70,9 +70,9 @@ ARG RUNTIME_PACKAGES="\
# Image processing
liblept5 \
liblcms2-2 \
libtiff5 \
libtiff6 \
libfreetype6 \
libwebp6 \
libwebp7 \
libopenjp2-7 \
libimagequant0 \
libraqm0 \
@ -98,6 +98,8 @@ ARG RUNTIME_PACKAGES="\
libxml2 \
libxslt1.1 \
libgnutls30 \
libqpdf29 \
qpdf \
# Mime type detection
file \
libmagic1 \
@ -181,7 +183,7 @@ ARG PSYCOPG2_VERSION=2.9.6
RUN set -eux \
&& echo "Getting binaries" \
&& mkdir paperless-ngx \
&& curl --fail --silent --show-error --output paperless-ngx.tar.gz --location https://github.com/paperless-ngx/builder/archive/3d6574e2dbaa8b8cdced864a256b0de59015f605.tar.gz \
&& curl --fail --silent --show-error --output paperless-ngx.tar.gz --location https://github.com/paperless-ngx/builder/archive/1f0e6665ba1b144f70fd6dfc8d0e8ba3b7a578ee.tar.gz \
&& tar -xf paperless-ngx.tar.gz --directory paperless-ngx --strip-components=1 \
&& cd paperless-ngx \
# Setting a specific revision ensures we know what this installed
@ -189,9 +191,7 @@ RUN set -eux \
&& echo "Installing jbig2enc" \
&& cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/jbig2 /usr/local/bin/ \
&& cp ./jbig2enc/${JBIG2ENC_VERSION}/${TARGETARCH}${TARGETVARIANT}/libjbig2enc* /usr/local/lib/ \
&& echo "Installing qpdf" \
&& apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/libqpdf29_*.deb \
&& apt-get install --yes --no-install-recommends ./qpdf/${QPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/qpdf_*.deb \
&& chmod a+x /usr/local/bin/jbig2 \
&& echo "Installing pikepdf and dependencies" \
&& python3 -m pip install --no-cache-dir ./pikepdf/${PIKEPDF_VERSION}/${TARGETARCH}${TARGETVARIANT}/*.whl \
&& python3 -m pip list \
@ -214,8 +214,7 @@ COPY --from=pipenv-base /usr/src/pipenv/requirements.txt ./
ARG BUILD_PACKAGES="\
build-essential \
git \
default-libmysqlclient-dev \
python3-dev"
default-libmysqlclient-dev"
RUN set -eux \
&& echo "Installing build system packages" \

View File

@ -37,14 +37,13 @@ psycopg2 = "*"
rapidfuzz = "*"
redis = {extras = ["hiredis"], version = "*"}
scikit-learn = "~=1.2"
numpy = "*"
whitenoise = "~=6.3"
watchdog = "~=2.2"
whoosh="~=2.7"
inotifyrecursive = "~=0.3"
ocrmypdf = "~=14.0"
tqdm = "*"
tika = "*"
tika-client = "*"
channels = "~=4.0"
channels-redis = "*"
uvicorn = {extras = ["standard"], version = "*"}
@ -78,6 +77,7 @@ factory-boy = "*"
pytest = "*"
pytest-cov = "*"
pytest-django = "*"
pytest-httpx = "*"
pytest-env = "*"
pytest-sugar = "*"
pytest-xdist = "*"

1080
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -80,7 +80,7 @@ django_checks() {
search_index() {
local -r index_version=5
local -r index_version=6
local -r index_version_file=${DATA_DIR}/.index_version
if [[ (! -f "${index_version_file}") || $(<"${index_version_file}") != "$index_version" ]]; then

View File

@ -28,7 +28,7 @@ if __name__ == "__main__":
except Exception as e:
print(
f"Redis ping #{attempt} failed.\n"
f"Error: {str(e)}.\n"
f"Error: {e!s}.\n"
f"Waiting {RETRY_SLEEP_SECONDS}s",
flush=True,
)

View File

@ -136,11 +136,11 @@ changed here.
Defaults to unset, using the documented path in the home directory.
`PAPERLESS_DB_TIMEOUT=<float>`
`PAPERLESS_DB_TIMEOUT=<int>`
: Amount of time for a database connection to wait for the database to
unlock. Mostly applicable for an sqlite based installation, consider
changing to postgresql if you need to increase this.
unlock. Mostly applicable for SQLite-based installations. Consider changing
to PostgreSQL if you are having concurrency problems with SQLite.
Defaults to unset, keeping the Django defaults.

View File

@ -2106,6 +2106,10 @@
<context context-type="sourcefile">src/app/components/common/tag/tag.component.html</context>
<context context-type="linenumber">8</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-small/document-card-small.component.ts</context>
<context context-type="linenumber">80</context>
</context-group>
</trans-unit>
<trans-unit id="6560126119609945418" datatype="html">
<source>Add tag</source>

View File

@ -29,7 +29,7 @@
<div class="card-body bg-light p-2">
<p class="card-text">
<ng-container *ngIf="document.correspondent">
<a title="Toggle correspondent filter" i18n-title (click)="clickCorrespondent.emit(document.correspondent);$event.stopPropagation()" class="fw-bold btn-link">{{(document.correspondent$ | async)?.name}}</a>:
<a title="Toggle correspondent filter" i18n-title (click)="clickCorrespondent.emit(document.correspondent);$event.stopPropagation()" class="fw-bold btn-link">{{(document.correspondent$ | async)?.name ?? privateName}}</a>:
</ng-container>
{{document.title | documentTitle}}
</p>
@ -41,14 +41,14 @@
<svg class="metadata-icon me-2 text-muted" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#file-earmark"/>
</svg>
<small>{{(document.document_type$ | async)?.name}}</small>
<small>{{(document.document_type$ | async)?.name ?? privateName}}</small>
</button>
<button *ngIf="document.storage_path" type="button" class="list-group-item list-group-item-action bg-transparent ps-0 p-1 border-0" title="Toggle storage path filter" i18n-title
(click)="clickStoragePath.emit(document.storage_path);$event.stopPropagation()">
<svg class="metadata-icon me-2 text-muted" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#folder"/>
</svg>
<small>{{(document.storage_path$ | async)?.name}}</small>
<small>{{(document.storage_path$ | async)?.name ?? privateName}}</small>
</button>
<div class="list-group-item bg-transparent p-0 border-0 d-flex flex-wrap-reverse justify-content-between">
<ng-template #dateTooltip>

View File

@ -76,6 +76,10 @@ export class DocumentCardSmallComponent extends ComponentWithPermissions {
return this.documentService.getPreviewUrl(this.document.id)
}
/**
 * Localized "Private" label.
 *
 * The small document card template falls back to this value
 * (`?.name ?? privateName`) when the correspondent, document type or
 * storage path name resolves to null/undefined.
 * NOTE(review): presumably that happens when the current user is not
 * allowed to view the related object — confirm against the services.
 */
get privateName() {
return $localize`Private`
}
getTagsLimited$() {
const limit = this.document.notes.length > 0 ? 6 : 7
return this.document.tags$.pipe(

View File

@ -5,7 +5,7 @@ export const environment = {
apiBaseUrl: document.baseURI + 'api/',
apiVersion: '3',
appTitle: 'Paperless-ngx',
version: '1.15.1',
version: '1.15.1-dev',
webSocketHost: window.location.host,
webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:',
webSocketBaseUrl: base_url.pathname + 'ws/',

View File

@ -121,7 +121,7 @@ class BarcodeReader:
if barcode.text:
barcodes.append(barcode.text)
logger.debug(
f"Barcode of type {str(barcode.format)} found: {barcode.text}",
f"Barcode of type {barcode.format} found: {barcode.text}",
)
return barcodes
@ -141,7 +141,7 @@ class BarcodeReader:
decoded_barcode = barcode.data.decode("utf-8")
barcodes.append(decoded_barcode)
logger.debug(
f"Barcode of type {str(barcode.type)} found: {decoded_barcode}",
f"Barcode of type {barcode.type} found: {decoded_barcode}",
)
return barcodes
@ -180,6 +180,9 @@ class BarcodeReader:
with scratch_image.open("rb") as img_file, self.pdf_file.open("wb") as pdf_file:
pdf_file.write(img2pdf.convert(img_file))
# Copy what file stat is possible
shutil.copystat(self.file, self.pdf_file)
def detect(self) -> None:
"""
Scan all pages of the PDF as images, updating barcodes and the pages
@ -292,6 +295,9 @@ class BarcodeReader:
savepath = Path(self.temp_dir.name) / output_filename
with open(savepath, "wb") as out:
dst.save(out)
shutil.copystat(self.file, savepath)
document_paths.append(savepath)
return document_paths
@ -342,7 +348,7 @@ class BarcodeReader:
for idx, document_path in enumerate(doc_paths):
if override_name is not None:
newname = f"{str(idx)}_{override_name}"
newname = f"{idx}_{override_name}"
dest = save_to_dir / newname
else:
dest = save_to_dir

View File

@ -69,7 +69,7 @@ class Consumer(LoggingMixin):
status,
message=None,
document_id=None,
):
): # pragma: no cover
payload = {
"filename": os.path.basename(self.filename) if self.filename else None,
"task_id": self.task_id,
@ -326,7 +326,7 @@ class Consumer(LoggingMixin):
dir=settings.SCRATCH_DIR,
)
self.path = Path(tempdir.name) / Path(self.filename)
shutil.copy(self.original_path, self.path)
shutil.copy2(self.original_path, self.path)
# Determine the parser class.
@ -352,7 +352,7 @@ class Consumer(LoggingMixin):
self.run_pre_consume_script()
def progress_callback(current_progress, max_progress):
def progress_callback(current_progress, max_progress): # pragma: no cover
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 50 + 20)
self._send_progress(p, 100, "WORKING")
@ -582,6 +582,7 @@ class Consumer(LoggingMixin):
def _write(self, storage_type, source, target):
with open(source, "rb") as read_file, open(target, "wb") as write_file:
write_file.write(read_file.read())
shutil.copystat(source, target)
def _log_script_outputs(self, completed_process: CompletedProcess):
"""

View File

@ -116,6 +116,8 @@ class DocumentFilterSet(FilterSet):
"created": DATE_KWARGS,
"added": DATE_KWARGS,
"modified": DATE_KWARGS,
"original_filename": CHAR_KWARGS,
"checksum": CHAR_KWARGS,
"correspondent": ["isnull"],
"correspondent__id": ID_KWARGS,
"correspondent__name": CHAR_KWARGS,

View File

@ -64,6 +64,8 @@ def get_schema():
owner_id=NUMERIC(),
has_owner=BOOLEAN(),
viewer_id=KEYWORD(commas=True),
checksum=TEXT(),
original_filename=TEXT(sortable=True),
)
@ -149,6 +151,8 @@ def update_document(writer: AsyncWriter, doc: Document):
owner_id=doc.owner.id if doc.owner else None,
has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None,
checksum=doc.checksum,
original_filename=doc.original_filename,
)
@ -171,91 +175,85 @@ def remove_document_from_index(document):
class DelayedQuery:
param_map = {
"correspondent": ("correspondent", ["id", "id__in", "id__none", "isnull"]),
"document_type": ("type", ["id", "id__in", "id__none", "isnull"]),
"storage_path": ("path", ["id", "id__in", "id__none", "isnull"]),
"owner": ("owner", ["id", "id__in", "id__none", "isnull"]),
"tags": ("tag", ["id__all", "id__in", "id__none"]),
"added": ("added", ["date__lt", "date__gt"]),
"created": ("created", ["date__lt", "date__gt"]),
"checksum": ("checksum", ["icontains", "istartswith"]),
"original_filename": ("original_filename", ["icontains", "istartswith"]),
}
def _get_query(self):
raise NotImplementedError
def _get_query_filter(self):
criterias = []
for k, v in self.query_params.items():
if k == "correspondent__id":
criterias.append(query.Term("correspondent_id", v))
elif k == "correspondent__id__in":
correspondents_in = []
for correspondent_id in v.split(","):
correspondents_in.append(
query.Term("correspondent_id", correspondent_id),
for key, value in self.query_params.items():
# is_tagged is a special case
if key == "is_tagged":
criterias.append(query.Term("has_tag", self.evalBoolean(value)))
continue
# Don't process query params without a filter
if "__" not in key:
continue
# All other query params consist of a parameter and a query filter
param, query_filter = key.split("__", 1)
try:
field, supported_query_filters = self.param_map[param]
except KeyError:
logger.error(f"Unable to build a query filter for parameter {key}")
continue
# We only support certain filters per parameter
if query_filter not in supported_query_filters:
logger.info(
f"Query filter {query_filter} not supported for parameter {param}",
)
criterias.append(query.Or(correspondents_in))
elif k == "correspondent__id__none":
for correspondent_id in v.split(","):
continue
if query_filter == "id":
criterias.append(query.Term(f"{field}_id", value))
elif query_filter == "id__in":
in_filter = []
for object_id in value.split(","):
in_filter.append(
query.Term(f"{field}_id", object_id),
)
criterias.append(query.Or(in_filter))
elif query_filter == "id__none":
for object_id in value.split(","):
criterias.append(
query.Not(query.Term("correspondent_id", correspondent_id)),
query.Not(query.Term(f"{field}_id", object_id)),
)
elif k == "tags__id__all":
for tag_id in v.split(","):
criterias.append(query.Term("tag_id", tag_id))
elif k == "tags__id__none":
for tag_id in v.split(","):
criterias.append(query.Not(query.Term("tag_id", tag_id)))
elif k == "tags__id__in":
tags_in = []
for tag_id in v.split(","):
tags_in.append(query.Term("tag_id", tag_id))
criterias.append(query.Or(tags_in))
elif k == "document_type__id":
criterias.append(query.Term("type_id", v))
elif k == "document_type__id__in":
document_types_in = []
for document_type_id in v.split(","):
document_types_in.append(query.Term("type_id", document_type_id))
criterias.append(query.Or(document_types_in))
elif k == "document_type__id__none":
for document_type_id in v.split(","):
criterias.append(query.Not(query.Term("type_id", document_type_id)))
elif k == "correspondent__isnull":
elif query_filter == "isnull":
criterias.append(
query.Term("has_correspondent", self.evalBoolean(v) is False),
query.Term(f"has_{field}", self.evalBoolean(value) is False),
)
elif k == "is_tagged":
criterias.append(query.Term("has_tag", self.evalBoolean(v)))
elif k == "document_type__isnull":
criterias.append(query.Term("has_type", self.evalBoolean(v) is False))
elif k == "created__date__lt":
elif query_filter == "id__all":
for object_id in value.split(","):
criterias.append(query.Term(f"{field}_id", object_id))
elif query_filter == "date__lt":
criterias.append(
query.DateRange("created", start=None, end=isoparse(v)),
query.DateRange(field, start=None, end=isoparse(value)),
)
elif k == "created__date__gt":
elif query_filter == "date__gt":
criterias.append(
query.DateRange("created", start=isoparse(v), end=None),
query.DateRange(field, start=isoparse(value), end=None),
)
elif query_filter == "icontains":
criterias.append(
query.Term(field, value),
)
elif query_filter == "istartswith":
criterias.append(
query.Prefix(field, value),
)
elif k == "added__date__gt":
criterias.append(query.DateRange("added", start=isoparse(v), end=None))
elif k == "added__date__lt":
criterias.append(query.DateRange("added", start=None, end=isoparse(v)))
elif k == "storage_path__id":
criterias.append(query.Term("path_id", v))
elif k == "storage_path__id__in":
storage_paths_in = []
for storage_path_id in v.split(","):
storage_paths_in.append(query.Term("path_id", storage_path_id))
criterias.append(query.Or(storage_paths_in))
elif k == "storage_path__id__none":
for storage_path_id in v.split(","):
criterias.append(query.Not(query.Term("path_id", storage_path_id)))
elif k == "storage_path__isnull":
criterias.append(query.Term("has_path", self.evalBoolean(v) is False))
elif k == "owner__isnull":
criterias.append(query.Term("has_owner", self.evalBoolean(v) is False))
elif k == "owner__id":
criterias.append(query.Term("owner_id", v))
elif k == "owner__id__in":
owners_in = []
for owner_id in v.split(","):
owners_in.append(query.Term("owner_id", owner_id))
criterias.append(query.Or(owners_in))
elif k == "owner__id__none":
for owner_id in v.split(","):
criterias.append(query.Not(query.Term("owner_id", owner_id)))
user_criterias = get_permissions_criterias(
user=self.user,

View File

@ -346,7 +346,7 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
logger.debug(f"Deleted file {filename}.")
except OSError as e:
logger.warning(
f"While deleting document {str(instance)}, the file "
f"While deleting document {instance!s}, the file "
f"{filename} could not be deleted: {e}",
)
@ -369,13 +369,13 @@ class CannotMoveFilesException(Exception):
def validate_move(instance, old_path, new_path):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logger.fatal(f"Document {str(instance)}: File {old_path} has gone.")
logger.fatal(f"Document {instance!s}: File {old_path} has gone.")
raise CannotMoveFilesException
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
logger.warning(
f"Document {str(instance)}: Cannot rename file "
f"Document {instance!s}: Cannot rename file "
f"since target path {new_path} already exists.",
)
raise CannotMoveFilesException

View File

@ -116,7 +116,7 @@ def consume_file(
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {str(e)}")
logger.warning(f"ConnectionError on status send: {e!s}")
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately

View File

@ -420,6 +420,74 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
results = response.data["results"]
self.assertEqual(len(results), 0)
def test_document_checksum_filter(self):
    """
    GIVEN:
        - Three documents with the checksums "A", "B" and "C"
    WHEN:
        - The document list is filtered with checksum__iexact
    THEN:
        - Only the document whose checksum matches is returned
        - A checksum that no document has returns no results
    """
    created = [
        Document.objects.create(
            title=title,
            checksum=checksum,
            mime_type="application/pdf",
        )
        for title, checksum in (("none1", "A"), ("none2", "B"), ("none3", "C"))
    ]
    doc2 = created[1]

    matching = self.client.get("/api/documents/?checksum__iexact=B")
    self.assertEqual(matching.status_code, status.HTTP_200_OK)
    found = matching.data["results"]
    self.assertEqual(len(found), 1)
    self.assertEqual(found[0]["id"], doc2.id)

    missing = self.client.get("/api/documents/?checksum__iexact=X")
    self.assertEqual(missing.status_code, status.HTTP_200_OK)
    found = missing.data["results"]
    self.assertEqual(len(found), 0)
def test_document_original_filename_filter(self):
    """
    GIVEN:
        - Three documents with the original filenames docA.pdf, docB.pdf
          and docC.pdf
    WHEN:
        - The document list is filtered with original_filename__iexact
        - The document list is filtered with original_filename__istartswith
    THEN:
        - iexact ignores case and returns only the matching document
        - iexact with an unknown filename returns no results
        - istartswith ignores case and returns all three documents
    """
    doc1, doc2, doc3 = (
        Document.objects.create(
            title=title,
            checksum=checksum,
            mime_type="application/pdf",
            original_filename=filename,
        )
        for title, checksum, filename in (
            ("none1", "A", "docA.pdf"),
            ("none2", "B", "docB.pdf"),
            ("none3", "C", "docC.pdf"),
        )
    )

    def results_for(url):
        # Every request in this test is expected to succeed
        response = self.client.get(url)
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        return response.data["results"]

    exact_match = results_for("/api/documents/?original_filename__iexact=DOCa.pdf")
    self.assertEqual(len(exact_match), 1)
    self.assertEqual(exact_match[0]["id"], doc1.id)

    no_match = results_for("/api/documents/?original_filename__iexact=docx.pdf")
    self.assertEqual(len(no_match), 0)

    prefix_match = results_for("/api/documents/?original_filename__istartswith=dOc")
    self.assertEqual(len(prefix_match), 3)
    self.assertCountEqual(
        [result["id"] for result in prefix_match],
        [doc1.id, doc2.id, doc3.id],
    )
def test_documents_title_content_filter(self):
doc1 = Document.objects.create(
title="title A",
@ -1086,17 +1154,19 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
checksum="4",
created=timezone.make_aware(datetime.datetime(2020, 7, 13)),
content="test",
original_filename="doc4.pdf",
)
d4.tags.add(t2)
d5 = Document.objects.create(
checksum="5",
added=timezone.make_aware(datetime.datetime(2020, 7, 13)),
content="test",
original_filename="doc5.pdf",
)
Document.objects.create(checksum="6", content="test2")
d7 = Document.objects.create(checksum="7", storage_path=sp, content="test")
d8 = Document.objects.create(
checksum="8",
checksum="foo",
correspondent=c2,
document_type=dt2,
storage_path=sp2,
@ -1239,6 +1309,16 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
),
)
self.assertEqual(
search_query("&checksum__icontains=foo"),
[d8.id],
)
self.assertCountEqual(
search_query("&original_filename__istartswith=doc"),
[d4.id, d5.id],
)
def test_search_filtering_respect_owner(self):
"""
GIVEN:
@ -2514,11 +2594,25 @@ class TestApiUiSettings(DirectoriesMixin, APITestCase):
def setUp(self):
super().setUp()
self.test_user = User.objects.create_superuser(username="test")
self.test_user.first_name = "Test"
self.test_user.last_name = "User"
self.test_user.save()
self.client.force_authenticate(user=self.test_user)
def test_api_get_ui_settings(self):
response = self.client.get(self.ENDPOINT, format="json")
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertDictEqual(
response.data["user"],
{
"id": self.test_user.id,
"username": self.test_user.username,
"is_superuser": True,
"groups": [],
"first_name": self.test_user.first_name,
"last_name": self.test_user.last_name,
},
)
self.assertDictEqual(
response.data["settings"],
{

View File

@ -12,6 +12,7 @@ from documents.barcodes import BarcodeReader
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource
from documents.models import Document
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
@ -764,7 +765,7 @@ class TestAsnBarcode(DirectoriesMixin, TestCase):
self.assertEqual(reader.pdf_file, test_file)
self.assertEqual(asn, 123)
def test_scan_file_for_asn_not_existing(self):
def test_scan_file_for_asn_not_found(self):
"""
GIVEN:
- PDF without an ASN barcode
@ -781,6 +782,49 @@ class TestAsnBarcode(DirectoriesMixin, TestCase):
self.assertEqual(reader.pdf_file, test_file)
self.assertEqual(asn, None)
@override_settings(CONSUMER_ENABLE_ASN_BARCODE=True)
def test_scan_file_for_asn_already_exists(self):
    """
    GIVEN:
        - PDF with an ASN barcode
        - A document with the same ASN value is already stored
    WHEN:
        - File is consumed
    THEN:
        - A ConsumerError is raised
        - The first error log entry reports that the ASN already exists
    """
    Document.objects.create(
        title="WOW",
        content="the content",
        archive_serial_number=123,
        checksum="456",
        mime_type="application/pdf",
    )

    sample = self.BARCODE_SAMPLE_DIR / "barcode-39-asn-123.pdf"
    dst = settings.SCRATCH_DIR / "barcode-39-asn-123.pdf"
    shutil.copy(sample, dst)

    document = ConsumableDocument(
        source=DocumentSource.ConsumeFolder,
        original_file=dst,
    )

    # Context managers are entered in the same order as the original nested
    # form: progress patch, then assertRaises, then assertLogs (innermost).
    with mock.patch(
        "documents.consumer.Consumer._send_progress",
    ), self.assertRaises(ConsumerError) as raised, self.assertLogs(
        "paperless.consumer",
        level="ERROR",
    ) as captured_logs:
        tasks.consume_file(document, None)

    self.assertIn("Not consuming barcode-39-asn-123.pdf", str(raised.exception))

    first_error = captured_logs.output[0]
    expected_error = "ERROR:paperless.consumer:Not consuming barcode-39-asn-123.pdf: Given ASN already exists!"
    self.assertEqual(expected_error, first_error)
def test_scan_file_for_asn_barcode_invalid(self):
"""
GIVEN:

View File

@ -0,0 +1,219 @@
from dateutil.parser import isoparse
from django.test import TestCase
from whoosh import query
from documents.index import DelayedQuery
from documents.index import get_permissions_criterias
from documents.models import User
class TestDelayedQuery(TestCase):
    """
    Tests for the whoosh query filter built by DelayedQuery._get_query_filter
    and for get_permissions_criterias.

    The table-driven tests run every case inside subTest, so a failure
    reports the offending parameters and the remaining cases still run.
    """

    def setUp(self):
        super().setUp()
        # all tests run without permission criteria, so has_no_owner query will always
        # be appended.
        self.has_no_owner = query.Or([query.Term("has_owner", False)])

    def _get_testset__id__in(self, param, field):
        """
        Returns a (query params, expected query) pair for "<param>__id__in"
        with the ids 42 and 43.
        """
        return (
            {f"{param}__id__in": "42,43"},
            query.And(
                [
                    query.Or(
                        [
                            query.Term(f"{field}_id", "42"),
                            query.Term(f"{field}_id", "43"),
                        ],
                    ),
                    self.has_no_owner,
                ],
            ),
        )

    def _get_testset__id__none(self, param, field):
        """
        Returns a (query params, expected query) pair for "<param>__id__none"
        with the ids 42 and 43.
        """
        return (
            {f"{param}__id__none": "42,43"},
            query.And(
                [
                    query.Not(query.Term(f"{field}_id", "42")),
                    query.Not(query.Term(f"{field}_id", "43")),
                    self.has_no_owner,
                ],
            ),
        )

    def _assert_query_filters(self, testset):
        """
        Builds the query filter for each (query params, expected query) pair
        and compares its items with the expected ones, ignoring order.
        """
        for params, expected in testset:
            with self.subTest(params=params):
                dq = DelayedQuery(None, params, None, None)
                got = dq._get_query_filter()
                self.assertCountEqual(got, expected)

    def test_get_permission_criteria(self):
        """
        The permission criteria depend on whether a user is given and on
        whether that user is a superuser.
        """
        # tests contains tuples of user instances and the expected filter
        tests = (
            (None, [query.Term("has_owner", False)]),
            (User(42, username="foo", is_superuser=True), []),
            (
                User(42, username="foo", is_superuser=False),
                [
                    query.Term("has_owner", False),
                    query.Term("owner_id", 42),
                    query.Term("viewer_id", "42"),
                ],
            ),
        )
        for user, expected in tests:
            with self.subTest(user=user):
                self.assertEqual(get_permissions_criterias(user), expected)

    def test_no_query_filters(self):
        """
        Without query params only the has_no_owner query is returned.
        """
        dq = DelayedQuery(None, {}, None, None)
        self.assertEqual(dq._get_query_filter(), self.has_no_owner)

    def test_date_query_filters(self):
        """
        __date__lt and __date__gt on created/added become DateRange queries.
        """

        def _get_testset(param: str):
            date_str = "1970-01-01T02:44"
            date_obj = isoparse(date_str)
            return (
                (
                    {f"{param}__date__lt": date_str},
                    query.And(
                        [
                            query.DateRange(param, start=None, end=date_obj),
                            self.has_no_owner,
                        ],
                    ),
                ),
                (
                    {f"{param}__date__gt": date_str},
                    query.And(
                        [
                            query.DateRange(param, start=date_obj, end=None),
                            self.has_no_owner,
                        ],
                    ),
                ),
            )

        query_params = ["created", "added"]
        for param in query_params:
            self._assert_query_filters(_get_testset(param))

    def test_is_tagged_query_filter(self):
        """
        is_tagged maps to a has_tag Term; the expected column lists the
        boolean each input string is evaluated to.
        """
        tests = (
            ("True", True),
            ("true", True),
            ("1", True),
            ("False", False),
            ("false", False),
            ("0", False),
            ("foo", False),
        )
        for param, expected in tests:
            with self.subTest(is_tagged=param):
                dq = DelayedQuery(None, {"is_tagged": param}, None, None)
                self.assertEqual(
                    dq._get_query_filter(),
                    query.And([query.Term("has_tag", expected), self.has_no_owner]),
                )

    def test_tags_query_filters(self):
        """
        tags supports __id__all, __id__in and __id__none; __id and __isnull
        add nothing to the query.
        """
        # tests contains tuples of query_parameter dicts and the expected whoosh query
        param = "tags"
        field, _ = DelayedQuery.param_map[param]
        tests = (
            (
                {f"{param}__id__all": "42,43"},
                query.And(
                    [
                        query.Term(f"{field}_id", "42"),
                        query.Term(f"{field}_id", "43"),
                        self.has_no_owner,
                    ],
                ),
            ),
            # tags does not allow __id
            (
                {f"{param}__id": "42"},
                self.has_no_owner,
            ),
            # tags does not allow __isnull
            (
                {f"{param}__isnull": "true"},
                self.has_no_owner,
            ),
            self._get_testset__id__in(param, field),
            self._get_testset__id__none(param, field),
        )
        self._assert_query_filters(tests)

    def test_generic_query_filters(self):
        """
        correspondent, document_type, storage_path and owner all support
        __id, __id__in, __id__none and __isnull.
        """

        def _get_testset(param: str):
            field, _ = DelayedQuery.param_map[param]
            return (
                (
                    {f"{param}__id": "42"},
                    query.And(
                        [
                            query.Term(f"{field}_id", "42"),
                            self.has_no_owner,
                        ],
                    ),
                ),
                self._get_testset__id__in(param, field),
                self._get_testset__id__none(param, field),
                (
                    {f"{param}__isnull": "true"},
                    query.And(
                        [
                            query.Term(f"has_{field}", False),
                            self.has_no_owner,
                        ],
                    ),
                ),
                (
                    {f"{param}__isnull": "false"},
                    query.And(
                        [
                            query.Term(f"has_{field}", True),
                            self.has_no_owner,
                        ],
                    ),
                ),
            )

        query_params = ["correspondent", "document_type", "storage_path", "owner"]
        for param in query_params:
            self._assert_query_filters(_get_testset(param))

    def test_char_query_filter(self):
        """
        checksum and original_filename map __icontains to a Term query and
        __istartswith to a Prefix query.
        """

        def _get_testset(param: str):
            return (
                (
                    {f"{param}__icontains": "foo"},
                    query.And(
                        [
                            query.Term(f"{param}", "foo"),
                            self.has_no_owner,
                        ],
                    ),
                ),
                (
                    {f"{param}__istartswith": "foo"},
                    query.And(
                        [
                            query.Prefix(f"{param}", "foo"),
                            self.has_no_owner,
                        ],
                    ),
                ),
            )

        query_params = ["checksum", "original_filename"]
        for param in query_params:
            self._assert_query_filters(_get_testset(param))

View File

@ -105,6 +105,20 @@ class FileSystemAssertsMixin:
def assertIsNotDir(self, path: Union[PathLike, str]):
self.assertFalse(Path(path).resolve().is_dir(), f"Dir does exist: {path}")
def assertFilesEqual(
    self,
    path1: Union[PathLike, str],
    path2: Union[PathLike, str],
):
    """
    Asserts that both files have the same content by comparing the SHA256
    digests of their bytes.

    On mismatch the failure message names both paths, in line with the
    other file system assertions of this mixin, which include the path.
    """
    import hashlib

    path1 = Path(path1)
    path2 = Path(path2)

    hash1 = hashlib.sha256(path1.read_bytes()).hexdigest()
    hash2 = hashlib.sha256(path2.read_bytes()).hexdigest()
    self.assertEqual(
        hash1,
        hash2,
        f"File SHA256 mismatch: {path1} != {path2}",
    )
class ConsumerProgressMixin:
def setUp(self) -> None:

View File

@ -519,7 +519,7 @@ class DocumentViewSet(
try:
return Response(self.getNotes(doc))
except Exception as e:
logger.warning(f"An error occurred retrieving notes: {str(e)}")
logger.warning(f"An error occurred retrieving notes: {e!s}")
return Response(
{"error": "Error retreiving notes, check logs for more detail."},
)
@ -538,7 +538,7 @@ class DocumentViewSet(
return Response(self.getNotes(doc))
except Exception as e:
logger.warning(f"An error occurred saving note: {str(e)}")
logger.warning(f"An error occurred saving note: {e!s}")
return Response(
{
"error": "Error saving note, check logs for more detail.",
@ -628,7 +628,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
except NotFound:
raise
except Exception as e:
logger.warning(f"An error occurred listing search results: {str(e)}")
logger.warning(f"An error occurred listing search results: {e!s}")
return HttpResponseBadRequest(
"Error listing search results, check logs for more detail.",
)
@ -699,7 +699,7 @@ class BulkEditView(GenericAPIView):
result = method(documents, **parameters)
return Response({"result": result})
except Exception as e:
logger.warning(f"An error occurred performing bulk edit: {str(e)}")
logger.warning(f"An error occurred performing bulk edit: {e!s}")
return HttpResponseBadRequest(
"Error performing bulk edit, check logs for more detail.",
)
@ -1028,16 +1028,23 @@ class UiSettingsView(GenericAPIView):
ui_settings["update_checking"] = {
"backend_setting": settings.ENABLE_UPDATE_CHECK,
}
user_resp = {
"id": user.id,
"username": user.username,
"is_superuser": user.is_superuser,
"groups": list(user.groups.values_list("id", flat=True)),
}
if len(user.first_name) > 0:
user_resp["first_name"] = user.first_name
if len(user.last_name) > 0:
user_resp["last_name"] = user.last_name
# strip <app_label>.
roles = map(lambda perm: re.sub(r"^\w+.", "", perm), user.get_all_permissions())
return Response(
{
"user": {
"id": user.id,
"username": user.username,
"is_superuser": user.is_superuser,
"groups": user.groups.values_list("id", flat=True),
},
"user": user_resp,
"settings": ui_settings,
"permissions": roles,
},

View File

@ -476,24 +476,24 @@ CSRF_COOKIE_NAME = f"{COOKIE_PREFIX}csrftoken"
SESSION_COOKIE_NAME = f"{COOKIE_PREFIX}sessionid"
LANGUAGE_COOKIE_NAME = f"{COOKIE_PREFIX}django_language"
###############################################################################
# Database #
###############################################################################
DATABASES = {
def _parse_db_settings() -> Dict:
databases = {
"default": {
"ENGINE": "django.db.backends.sqlite3",
"NAME": os.path.join(DATA_DIR, "db.sqlite3"),
"OPTIONS": {},
},
}
if os.getenv("PAPERLESS_DBHOST"):
}
if os.getenv("PAPERLESS_DBHOST"):
# Have sqlite available as a second option for management commands
# This is important when migrating to/from sqlite
DATABASES["sqlite"] = DATABASES["default"].copy()
databases["sqlite"] = databases["default"].copy()
DATABASES["default"] = {
databases["default"] = {
"HOST": os.getenv("PAPERLESS_DBHOST"),
"NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
"USER": os.getenv("PAPERLESS_DBUSER", "paperless"),
@ -501,7 +501,7 @@ if os.getenv("PAPERLESS_DBHOST"):
"OPTIONS": {},
}
if os.getenv("PAPERLESS_DBPORT"):
DATABASES["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT")
databases["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT")
# Leave room for future extensibility
if os.getenv("PAPERLESS_DBENGINE") == "mariadb":
@ -517,12 +517,6 @@ if os.getenv("PAPERLESS_DBHOST"):
},
}
# Silence Django error on old MariaDB versions.
# VARCHAR can support > 255 in modern versions
# https://docs.djangoproject.com/en/4.1/ref/checks/#database
# https://mariadb.com/kb/en/innodb-system-variables/#innodb_large_prefix
SILENCED_SYSTEM_CHECKS = ["mysql.W003"]
else: # Default to PostgresDB
engine = "django.db.backends.postgresql_psycopg2"
options = {
@ -532,13 +526,32 @@ if os.getenv("PAPERLESS_DBHOST"):
"sslkey": os.getenv("PAPERLESS_DBSSLKEY", None),
}
DATABASES["default"]["ENGINE"] = engine
DATABASES["default"]["OPTIONS"].update(options)
databases["default"]["ENGINE"] = engine
databases["default"]["OPTIONS"].update(options)
if os.getenv("PAPERLESS_DB_TIMEOUT") is not None:
DATABASES["default"]["OPTIONS"].update(
{"timeout": float(os.getenv("PAPERLESS_DB_TIMEOUT"))},
if os.getenv("PAPERLESS_DB_TIMEOUT") is not None:
if databases["default"]["ENGINE"] == "django.db.backends.sqlite3":
databases["default"]["OPTIONS"].update(
{"timeout": int(os.getenv("PAPERLESS_DB_TIMEOUT"))},
)
else:
databases["default"]["OPTIONS"].update(
{"connect_timeout": int(os.getenv("PAPERLESS_DB_TIMEOUT"))},
)
databases["sqlite"]["OPTIONS"].update(
{"timeout": int(os.getenv("PAPERLESS_DB_TIMEOUT"))},
)
return databases
DATABASES = _parse_db_settings()
if os.getenv("PAPERLESS_DBENGINE") == "mariadb":
# Silence Django error on old MariaDB versions.
# VARCHAR can support > 255 in modern versions
# https://docs.djangoproject.com/en/4.1/ref/checks/#database
# https://mariadb.com/kb/en/innodb-system-variables/#innodb_large_prefix
SILENCED_SYSTEM_CHECKS = ["mysql.W003"]
DEFAULT_AUTO_FIELD = "django.db.models.AutoField"
@ -662,6 +675,8 @@ CELERY_WORKER_MAX_TASKS_PER_CHILD = 1
CELERY_WORKER_SEND_TASK_EVENTS = True
CELERY_TASK_SEND_SENT_EVENT = True
CELERY_SEND_TASK_SENT_EVENT = True
CELERY_BROKER_CONNECTION_RETRY = True
CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP = True
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)

View File

@ -6,6 +6,7 @@ from unittest import mock
from celery.schedules import crontab
from paperless.settings import _parse_beat_schedule
from paperless.settings import _parse_db_settings
from paperless.settings import _parse_ignore_dates
from paperless.settings import _parse_redis_url
from paperless.settings import default_threads_per_worker
@ -291,3 +292,60 @@ class TestCeleryScheduleParsing(TestCase):
{},
schedule,
)
class TestDBSettings(TestCase):
    """
    Tests how PAPERLESS_DB_TIMEOUT ends up in the database OPTIONS returned
    by _parse_db_settings
    """

    def test_db_timeout_with_sqlite(self):
        """
        GIVEN:
            - PAPERLESS_DB_TIMEOUT is set
        WHEN:
            - Settings are parsed
        THEN:
            - PAPERLESS_DB_TIMEOUT set for sqlite
        """
        # NOTE(review): patch.dict only adds to the existing environment, so
        # this relies on PAPERLESS_DBHOST not being set by the surrounding
        # environment - confirm for CI setups which configure a database host
        with mock.patch.dict(
            os.environ,
            {
                "PAPERLESS_DB_TIMEOUT": "10",
            },
        ):
            databases = _parse_db_settings()

            # The setting is converted with int(), so expect an integer
            self.assertDictEqual(
                {
                    "timeout": 10,
                },
                databases["default"]["OPTIONS"],
            )

    def test_db_timeout_with_not_sqlite(self):
        """
        GIVEN:
            - PAPERLESS_DB_TIMEOUT is set but db is not sqlite
        WHEN:
            - Settings are parsed
        THEN:
            - PAPERLESS_DB_TIMEOUT set correctly in non-sqlite db & for fallback sqlite db
        """
        with mock.patch.dict(
            os.environ,
            {
                "PAPERLESS_DBHOST": "127.0.0.1",
                "PAPERLESS_DB_TIMEOUT": "10",
            },
        ):
            databases = _parse_db_settings()

            # assertDictContainsSubset is deprecated and removed in
            # Python 3.12, so check the single expected key explicitly. The
            # other OPTIONS of the non-sqlite database are not under test
            default_options = databases["default"]["OPTIONS"]
            self.assertIn("connect_timeout", default_options)
            self.assertEqual(10, default_options["connect_timeout"])

            self.assertDictEqual(
                {
                    "timeout": 10,
                },
                databases["sqlite"]["OPTIONS"],
            )

View File

@ -384,6 +384,8 @@ def make_criterias(rule: MailRule, supports_gmail_labels: bool):
if isinstance(rule_query, dict):
if len(rule_query) or len(criterias):
return AND(**rule_query, **criterias)
else:
return "ALL"
else:
return AND(rule_query, **criterias)
@ -542,7 +544,7 @@ class MailAccountHandler(LoggingMixin):
criterias = make_criterias(rule, supports_gmail_labels)
self.log.debug(
f"Rule {rule}: Searching folder with criteria {str(criterias)}",
f"Rule {rule}: Searching folder with criteria {criterias}",
)
try:

View File

@ -1,18 +1,18 @@
import os
import re
from html import escape
from io import BytesIO
from io import StringIO
from pathlib import Path
from typing import List
import requests
import httpx
from bleach import clean
from bleach import linkify
from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from humanfriendly import format_size
from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika import parser
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@ -22,33 +22,15 @@ from documents.parsers import make_thumbnail_from_pdf
class MailDocumentParser(DocumentParser):
"""
This parser uses imap_tools to parse .eml files, generates pdf using
gotenbergs and sends the html part to a local tika server for text extraction.
Gotenberg and sends the html part to a Tika server for text extraction.
"""
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
tika_server = settings.TIKA_ENDPOINT
logging_name = "paperless.parsing.mail"
_parsed = None
def get_parsed(self, document_path) -> MailMessage:
if not self._parsed:
try:
with open(document_path, "rb") as eml:
self._parsed = MailMessage.from_bytes(eml.read())
except Exception as err:
raise ParseError(
f"Could not parse {document_path}: {err}",
) from err
if not self._parsed.from_values:
self._parsed = None
raise ParseError(
f"Could not parse {document_path}: Missing 'from'",
)
return self._parsed
def get_thumbnail(self, document_path, mime_type, file_name=None):
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
if not self.archive_path:
self.archive_path = self.generate_pdf(document_path)
@ -58,11 +40,11 @@ class MailDocumentParser(DocumentParser):
self.logging_group,
)
def extract_metadata(self, document_path, mime_type):
def extract_metadata(self, document_path: Path, mime_type: str):
result = []
try:
mail = self.get_parsed(document_path)
mail = self.parse_file_to_message(document_path)
except ParseError as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
@ -106,101 +88,157 @@ class MailDocumentParser(DocumentParser):
result.sort(key=lambda item: (item["prefix"], item["key"]))
return result
def parse(self, document_path, mime_type, file_name=None):
def parse(self, document_path: Path, mime_type: str, file_name=None):
"""
Parses the given .eml into formatted text, based on the decoded email.
"""
def strip_text(text: str):
"""
Reduces the spacing of the given text string
"""
text = re.sub(r"\s+", " ", text)
text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
mail = self.get_parsed(document_path)
self.text = f"Subject: {mail.subject}\n\n"
self.text += f"From: {mail.from_values.full}\n\n"
self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n"
if len(mail.cc_values) >= 1:
self.text += (
def build_formatted_text(mail_message: MailMessage) -> str:
"""
Constructs a formatted string, based on the given email. Basically tries
to get most of the email content, included front matter, into a nice string
"""
fmt_text = f"Subject: {mail_message.subject}\n\n"
fmt_text += f"From: {mail_message.from_values.full}\n\n"
to_list = [address.full for address in mail_message.to_values]
fmt_text += f"To: {', '.join(to_list)}\n\n"
if mail_message.cc_values:
fmt_text += (
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
)
if len(mail.bcc_values) >= 1:
self.text += (
if mail_message.bcc_values:
fmt_text += (
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
)
if len(mail.attachments) >= 1:
if mail_message.attachments:
att = []
for a in mail.attachments:
att.append(f"{a.filename} ({format_size(a.size, binary=True)})")
self.text += f"Attachments: {', '.join(att)}\n\n"
fmt_text += f"Attachments: {', '.join(att)}\n\n"
if mail.html:
self.text += "HTML content: " + strip_text(self.tika_parse(mail.html))
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
self.text += f"\n\n{strip_text(mail.text)}"
fmt_text += f"\n\n{strip_text(mail.text)}"
return fmt_text
self.log.debug(f"Parsing file {document_path.name} into an email")
mail = self.parse_file_to_message(document_path)
self.log.debug("Building formatted text from email")
self.text = build_formatted_text(mail)
if is_naive(mail.date):
self.date = make_aware(mail.date)
else:
self.date = mail.date
self.archive_path = self.generate_pdf(document_path)
self.log.debug("Creating a PDF from the email")
self.archive_path = self.generate_pdf(mail)
@staticmethod
def parse_file_to_message(filepath: Path) -> MailMessage:
    """
    Parses the given .eml file into a MailMessage object

    Raises a ParseError if the file cannot be read or parsed, or if the
    parsed message has no 'from' value
    """
    try:
        with filepath.open("rb") as eml:
            parsed = MailMessage.from_bytes(eml.read())
        if parsed.from_values is None:
            raise ParseError(
                f"Could not parse {filepath}: Missing 'from'",
            )
    except ParseError:
        # Already carries the final message. Letting it fall through to the
        # broad handler below would wrap it a second time and duplicate the
        # "Could not parse" prefix
        raise
    except Exception as err:
        raise ParseError(
            f"Could not parse {filepath}: {err}",
        ) from err

    return parsed
def tika_parse(self, html: str):
self.log.info("Sending content to Tika server")
try:
parsed = parser.from_buffer(html, self.tika_server)
with TikaClient(tika_url=self.tika_server) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
if "X-TIKA:content" in parsed.data:
return parsed.data["X-TIKA:content"].strip()
return ""
except Exception as err:
raise ParseError(
f"Could not parse content with tika server at "
f"{self.tika_server}: {err}",
) from err
if parsed["content"]:
return parsed["content"]
else:
return ""
def generate_pdf(self, document_path):
pdf_collection = []
def generate_pdf(self, mail_message: MailMessage) -> Path:
archive_path = Path(self.tempdir) / "merged.pdf"
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
# If no HTML content, create the PDF from the message
# Otherwise, create 2 PDFs and merge them with Gotenberg
if not mail_message.html:
archive_path.write_bytes(mail_pdf_file.read_bytes())
else:
url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
pdf_path = os.path.join(self.tempdir, "merged.pdf")
mail = self.get_parsed(document_path)
pdf_collection.append(("1_mail.pdf", self.generate_pdf_from_mail(mail)))
if not mail.html:
with open(pdf_path, "wb") as file:
file.write(pdf_collection[0][1])
file.close()
return pdf_path
else:
pdf_collection.append(
(
"2_html.pdf",
self.generate_pdf_from_html(mail.html, mail.attachments),
),
pdf_of_html_content = self.generate_pdf_from_html(
mail_message.html,
mail_message.attachments,
)
files = {}
for name, content in pdf_collection:
files[name] = (name, BytesIO(content))
headers = {}
pdf_collection = {
"1_mail.pdf": ("1_mail.pdf", mail_pdf_file, "application/pdf"),
"2_html.pdf": ("2_html.pdf", pdf_of_html_content, "application/pdf"),
}
try:
response = requests.post(url_merge, files=files, headers=headers)
# Open a handle to each file, replacing the tuple
for filename in pdf_collection:
file_multi_part = pdf_collection[filename]
pdf_collection[filename] = (
file_multi_part[0],
file_multi_part[1].open("rb"),
file_multi_part[2],
)
response = httpx.post(url_merge, files=pdf_collection)
response.raise_for_status() # ensure we notice bad responses
archive_path.write_bytes(response.content)
except Exception as err:
raise ParseError(f"Error while converting document to PDF: {err}") from err
raise ParseError(
f"Error while merging email HTML into PDF: {err}",
) from err
finally:
for filename in pdf_collection:
file_multi_part_handle = pdf_collection[filename][1]
file_multi_part_handle.close()
with open(pdf_path, "wb") as file:
file.write(response.content)
file.close()
return archive_path
return pdf_path
def mail_to_html(self, mail: MailMessage) -> Path:
"""
Converts the given email into an HTML file, formatted
based on the given template
"""
@staticmethod
def mail_to_html(mail: MailMessage) -> StringIO:
data = {}
def clean_html(text: str):
def clean_html(text: str) -> str:
"""
Attempts to clean, escape and linkify the given HTML string
"""
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if type(text) != str:
@ -211,6 +249,8 @@ class MailDocumentParser(DocumentParser):
text = text.replace("\n", "<br>")
return text
data = {}
data["subject"] = clean_html(mail.subject)
if data["subject"]:
data["subject_label"] = "Subject"
@ -237,27 +277,33 @@ class MailDocumentParser(DocumentParser):
data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M"))
data["content"] = clean_html(mail.text.strip())
html = StringIO()
from django.template.loader import render_to_string
rendered = render_to_string("email_msg_template.html", context=data)
html_file = Path(self.tempdir) / "email_as_html.html"
html_file.write_text(render_to_string("email_msg_template.html", context=data))
html.write(rendered)
html.seek(0)
return html_file
return html
def generate_pdf_from_mail(self, mail):
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
"""
Creates a PDF based on the given email, using the email's values in
an HTML template
"""
url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log.info("Converting mail to PDF")
css_file = os.path.join(os.path.dirname(__file__), "templates/output.css")
css_file = Path(__file__).parent / "templates" / "output.css"
email_html_file = self.mail_to_html(mail)
with open(css_file, "rb") as css_handle:
print(css_file)
print(email_html_file)
with css_file.open("rb") as css_handle, email_html_file.open(
"rb",
) as email_html_handle:
files = {
"html": ("index.html", self.mail_to_html(mail)),
"css": ("output.css", css_handle),
"html": ("index.html", email_html_handle, "text/html"),
"css": ("output.css", css_handle, "text/css"),
}
headers = {}
data = {
@ -280,7 +326,7 @@ class MailDocumentParser(DocumentParser):
data["pdfFormat"] = "PDF/A-3b"
try:
response = requests.post(
response = httpx.post(
url,
files=files,
headers=headers,
@ -289,13 +335,23 @@ class MailDocumentParser(DocumentParser):
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(
f"Error while converting document to PDF: {err}",
f"Error while converting email to PDF: {err}",
) from err
return response.content
email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
email_as_pdf_file.write_bytes(response.content)
return email_as_pdf_file
def generate_pdf_from_html(
self,
orig_html: str,
attachments: List[MailAttachment],
) -> Path:
"""
Generates a PDF file based on the HTML and attachments of the email
"""
@staticmethod
def transform_inline_html(html, attachments):
def clean_html_script(text: str):
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
text = compiled_open.sub("<div hidden ", text)
@ -304,28 +360,36 @@ class MailDocumentParser(DocumentParser):
text = compiled_close.sub("</div", text)
return text
html_clean = clean_html_script(html)
files = []
for a in attachments:
name_cid = "cid:" + a.content_id
name_clean = "".join(e for e in name_cid if e.isalnum())
files.append((name_clean, BytesIO(a.payload)))
html_clean = html_clean.replace(name_cid, name_clean)
files.append(("index.html", StringIO(html_clean)))
return files
def generate_pdf_from_html(self, orig_html, attachments):
url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log.info("Converting html to PDF")
files = {}
for name, file in self.transform_inline_html(orig_html, attachments):
files[name] = (name, file)
tempdir = Path(self.tempdir)
html_clean = clean_html_script(orig_html)
files = {}
for attachment in attachments:
# Clean the attachment name to be valid
name_cid = f"cid:{attachment.content_id}"
name_clean = "".join(e for e in name_cid if e.isalnum())
# Write attachment payload to a temp file
temp_file = tempdir / name_clean
temp_file.write_bytes(attachment.payload)
# Store the attachment for upload
files[name_clean] = (name_clean, temp_file, attachment.content_type)
# Replace as needed the name with the clean name
html_clean = html_clean.replace(name_cid, name_clean)
# Now store the cleaned up HTML version
html_clean_file = tempdir / "index.html"
html_clean_file.write_text(html_clean)
files["index.html"] = ("index.html", html_clean_file, "text/html")
headers = {}
data = {
"marginTop": "0.1",
"marginBottom": "0.1",
@ -336,14 +400,29 @@ class MailDocumentParser(DocumentParser):
"scale": "1.0",
}
try:
response = requests.post(
# Open a handle to each file, replacing the tuple
for filename in files:
file_multi_part = files[filename]
files[filename] = (
file_multi_part[0],
file_multi_part[1].open("rb"),
file_multi_part[2],
)
response = httpx.post(
url,
files=files,
headers=headers,
data=data,
)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(f"Error while converting document to PDF: {err}") from err
finally:
# Ensure all file handles are closed
for filename in files:
file_multi_part_handle = files[filename][1]
file_multi_part_handle.close()
return response.content
html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content)
return html_pdf

View File

@ -721,6 +721,31 @@ class TestMail(
self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
def test_handle_mail_account_move_no_filters(self):
    """
    GIVEN:
        - Mail account with a rule which moves mail to "spam"
        - The rule sets no filters
    WHEN:
        - The mail account is handled and the mail actions are applied
    THEN:
        - All 3 messages have left the mailbox and are in spam
    """
    mail_account = MailAccount.objects.create(
        name="test",
        imap_server="",
        username="admin",
        password="secret",
    )

    # Only the database row matters; the returned rule object is not needed
    MailRule.objects.create(
        name="testrule",
        account=mail_account,
        action=MailRule.MailAction.MOVE,
        action_parameter="spam",
        maximum_age=0,
    )

    # Before handling: everything in the mailbox, nothing in spam
    self.assertEqual(len(self.bogus_mailbox.messages), 3)
    self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)

    self.mail_account_handler.handle_mail_account(mail_account)
    self.apply_mail_actions()

    # After handling: the mailbox is empty and spam holds all messages
    self.assertEqual(len(self.bogus_mailbox.messages), 0)
    self.assertEqual(len(self.bogus_mailbox.messages_spam), 3)
def test_handle_mail_account_tag(self):
account = MailAccount.objects.create(
name="test",

View File

@ -1,24 +1,39 @@
import datetime
import os
from pathlib import Path
from unittest import mock
import httpx
from django.test import TestCase
from documents.parsers import ParseError
from documents.tests.utils import FileSystemAssertsMixin
from paperless_mail.parsers import MailDocumentParser
from paperless_tika.tests.utils import HttpxMockMixin
class TestParser(FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
class BaseMailParserTestCase(TestCase):
    """
    Common fixture for the mail parser test cases: exposes the directory of
    sample files and gives every test its own MailDocumentParser
    """

    # Directory named "samples", located next to this test module
    SAMPLE_DIR = Path(__file__).with_name("samples")

    def setUp(self) -> None:
        """
        Creates a fresh parser, without a logging group, for the test
        """
        super().setUp()
        self.parser = MailDocumentParser(logging_group=None)

    def tearDown(self) -> None:
        """
        Calls the parser's cleanup once the test has finished
        """
        super().tearDown()
        self.parser.cleanup()
def test_get_parsed_missing_file(self):
class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
"""
Tests around reading a file and parsing it into a
MailMessage
"""
def test_parse_error_missing_file(self):
"""
GIVEN:
- Fresh parser
@ -28,13 +43,17 @@ class TestParser(FileSystemAssertsMixin, TestCase):
- An Exception is thrown
"""
# Check if exception is raised when parsing fails.
test_file = self.SAMPLE_DIR / "doesntexist.eml"
self.assertIsNotFile(test_file)
self.assertRaises(
ParseError,
self.parser.get_parsed,
os.path.join(self.SAMPLE_FILES, "na"),
self.parser.parse,
test_file,
"messages/rfc822",
)
def test_get_parsed_broken_file(self):
def test_parse_error_invalid_email(self):
"""
GIVEN:
- Fresh parser
@ -46,11 +65,12 @@ class TestParser(FileSystemAssertsMixin, TestCase):
# Check if exception is raised when the mail is faulty.
self.assertRaises(
ParseError,
self.parser.get_parsed,
os.path.join(self.SAMPLE_FILES, "broken.eml"),
self.parser.parse,
self.SAMPLE_DIR / "broken.eml",
"messages/rfc822",
)
def test_get_parsed_simple_text_mail(self):
def test_parse_simple_text_email_file(self):
"""
GIVEN:
- Fresh parser
@ -60,8 +80,8 @@ class TestParser(FileSystemAssertsMixin, TestCase):
- The content of the mail should be available in the parse result.
"""
# Parse Test file and check relevant content
parsed1 = self.parser.get_parsed(
os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
parsed1 = self.parser.parse_file_to_message(
self.SAMPLE_DIR / "simple_text.eml",
)
self.assertEqual(parsed1.date.year, 2022)
@ -76,58 +96,11 @@ class TestParser(FileSystemAssertsMixin, TestCase):
self.assertEqual(parsed1.text, "This is just a simple Text Mail.\n")
self.assertEqual(parsed1.to, ("some@one.de",))
def test_get_parsed_reparse(self):
"""
GIVEN:
- An E-Mail was parsed
WHEN:
- Another .eml file should be parsed
THEN:
- The parser should not retry to parse and return the old results
"""
# Parse Test file and check relevant content
parsed1 = self.parser.get_parsed(
os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
)
# Check if same parsed object as before is returned, even if another file is given.
parsed2 = self.parser.get_parsed(
os.path.join(os.path.join(self.SAMPLE_FILES, "html.eml")),
)
self.assertEqual(parsed1, parsed2)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
@mock.patch("paperless_mail.parsers.make_thumbnail_from_pdf")
def test_get_thumbnail(
self,
mock_make_thumbnail_from_pdf: mock.MagicMock,
mock_generate_pdf: mock.MagicMock,
):
class TestEmailMetadataExtraction(BaseMailParserTestCase):
"""
GIVEN:
- An E-Mail was parsed
WHEN:
- The Thumbnail is requested
THEN:
- The parser should call the functions which generate the thumbnail
Tests extraction of metadata from an email
"""
mocked_return = "Passing the return value through.."
mock_make_thumbnail_from_pdf.return_value = mocked_return
mock_generate_pdf.return_value = "Mocked return value.."
thumb = self.parser.get_thumbnail(
os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
"message/rfc822",
)
self.assertEqual(
self.parser.archive_path,
mock_make_thumbnail_from_pdf.call_args_list[0].args[0],
)
self.assertEqual(
self.parser.tempdir,
mock_make_thumbnail_from_pdf.call_args_list[0].args[1],
)
self.assertEqual(mocked_return, thumb)
def test_extract_metadata_fail(self):
"""
@ -157,7 +130,7 @@ class TestParser(FileSystemAssertsMixin, TestCase):
"""
# Validate Metadata parsing returns the expected results
metadata = self.parser.extract_metadata(
os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
)
@ -287,90 +260,53 @@ class TestParser(FileSystemAssertsMixin, TestCase):
metadata,
)
def test_parse_na(self):
class TestEmailThumbnailGenerate(BaseMailParserTestCase):
"""
Tests the correct generation of a thumbnail for an email
"""
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
@mock.patch("paperless_mail.parsers.make_thumbnail_from_pdf")
def test_get_thumbnail(
self,
mock_make_thumbnail_from_pdf: mock.MagicMock,
mock_generate_pdf: mock.MagicMock,
):
"""
GIVEN:
- Fresh start
- An E-Mail was parsed
WHEN:
- parsing is attempted with nonexistent file
- The Thumbnail is requested
THEN:
- Exception is thrown
- The parser should call the functions which generate the thumbnail
"""
# Check if exception is raised when parsing fails.
self.assertRaises(
ParseError,
self.parser.parse,
os.path.join(self.SAMPLE_FILES, "na"),
mocked_return = "Passing the return value through.."
mock_make_thumbnail_from_pdf.return_value = mocked_return
mock_generate_pdf.return_value = "Mocked return value.."
test_file = self.SAMPLE_DIR / "simple_text.eml"
thumb = self.parser.get_thumbnail(
test_file,
"message/rfc822",
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.tika_parse")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_parse_html_eml(self, n, mock_tika_parse: mock.MagicMock):
"""
GIVEN:
- Fresh start
WHEN:
- parsing is done with html mail
THEN:
- Tika is called, parsed information from non html parts is available
"""
# Validate parsing returns the expected results
text_expected = "Subject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return\n\nSome Text and an embedded image."
mock_tika_parse.return_value = "tika return"
self.parser.parse(os.path.join(self.SAMPLE_FILES, "html.eml"), "message/rfc822")
self.assertEqual(text_expected, self.parser.text)
self.assertEqual(
datetime.datetime(
2022,
10,
15,
11,
23,
19,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
),
self.parser.date,
mock_generate_pdf.assert_called_once_with(
test_file,
)
mock_make_thumbnail_from_pdf.assert_called_once_with(
"Mocked return value..",
self.parser.tempdir,
None,
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_parse_simple_eml(self, m: mock.MagicMock):
"""
GIVEN:
- Fresh start
WHEN:
- parsing is done with non html mail
THEN:
- parsed information is available
"""
# Validate parsing returns the expected results
self.assertEqual(mocked_return, thumb)
self.parser.parse(
os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
"message/rfc822",
)
text_expected = "Subject: Simple Text Mail\n\nFrom: Some One <mail@someserver.de>\n\nTo: some@one.de\n\nCC: asdasd@æsdasd.de, asdadasdasdasda.asdasd@æsdasd.de\n\nBCC: fdf@fvf.de\n\n\n\nThis is just a simple Text Mail."
self.assertEqual(text_expected, self.parser.text)
self.assertEqual(
datetime.datetime(
2022,
10,
12,
21,
40,
43,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
),
self.parser.date,
)
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
m.assert_called()
@mock.patch("paperless_mail.parsers.parser.from_buffer")
def test_tika_parse_unsuccessful(self, mock_from_buffer: mock.MagicMock):
class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
def test_tika_parse_unsuccessful(self):
"""
GIVEN:
- Fresh start
@ -380,12 +316,13 @@ class TestParser(FileSystemAssertsMixin, TestCase):
- the parser should return an empty string
"""
# Check unsuccessful parsing
mock_from_buffer.return_value = {"content": None}
parsed = self.parser.tika_parse(None)
self.httpx_mock.add_response(
json={"Content-Type": "text/html", "X-TIKA:Parsed-By": []},
)
parsed = self.parser.tika_parse("None")
self.assertEqual("", parsed)
@mock.patch("paperless_mail.parsers.parser.from_buffer")
def test_tika_parse(self, mock_from_buffer: mock.MagicMock):
def test_tika_parse(self):
"""
GIVEN:
- Fresh start
@ -397,14 +334,18 @@ class TestParser(FileSystemAssertsMixin, TestCase):
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
expected_text = "Some Text"
# Check successful parsing
mock_from_buffer.return_value = {"content": expected_text}
self.httpx_mock.add_response(
json={
"Content-Type": "text/html",
"X-TIKA:Parsed-By": [],
"X-TIKA:content": expected_text,
},
)
parsed = self.parser.tika_parse(html)
self.assertEqual(expected_text, parsed.strip())
mock_from_buffer.assert_called_with(html, self.parser.tika_server)
self.assertIn(self.parser.tika_server, str(self.httpx_mock.get_request().url))
@mock.patch("paperless_mail.parsers.parser.from_buffer")
def test_tika_parse_exception(self, mock_from_buffer: mock.MagicMock):
def test_tika_parse_exception(self):
"""
GIVEN:
- Fresh start
@ -415,11 +356,8 @@ class TestParser(FileSystemAssertsMixin, TestCase):
"""
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
# Check ParseError
def my_side_effect():
raise Exception("Test")
self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
mock_from_buffer.side_effect = my_side_effect
self.assertRaises(ParseError, self.parser.tika_parse, html)
def test_tika_parse_unreachable(self):
@ -437,258 +375,285 @@ class TestParser(FileSystemAssertsMixin, TestCase):
self.parser.tika_server = ""
self.assertRaises(ParseError, self.parser.tika_parse, html)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf_parse_error(self, m: mock.MagicMock, n: mock.MagicMock):
class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase):
def test_parse_no_file(self):
"""
GIVEN:
- Fresh start
WHEN:
- pdf generation is requested but gotenberg can not be reached
- parsing is attempted with nonexistent file
THEN:
- Exception is thrown
"""
# Check if exception is raised when parsing fails.
self.assertRaises(
ParseError,
self.parser.parse,
self.SAMPLE_DIR / "na.eml",
"message/rfc822",
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_parse_eml_simple(self, mock_generate_pdf: mock.MagicMock):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - parsing is done with non html mail
    THEN:
        - parsed information is available
    """
    sample_file = self.SAMPLE_DIR / "simple_text.eml"

    expected_text = (
        "Subject: Simple Text Mail\n\n"
        "From: Some One <mail@someserver.de>\n\n"
        "To: some@one.de\n\n"
        "CC: asdasd@æsdasd.de, asdadasdasdasda.asdasd@æsdasd.de\n\n"
        "BCC: fdf@fvf.de\n\n"
        "\n\nThis is just a simple Text Mail."
    )
    # The expected date carries a fixed +02:00 offset
    plus_two_hours = datetime.timezone(datetime.timedelta(seconds=7200))
    expected_date = datetime.datetime(2022, 10, 12, 21, 40, 43, tzinfo=plus_two_hours)

    self.parser.parse(sample_file, "message/rfc822")

    # Validate parsing returns the expected results
    self.assertEqual(expected_text, self.parser.text)
    self.assertEqual(expected_date, self.parser.date)

    # Only confirm archive generation was attempted; generate_pdf() is
    # covered in more depth by its own tests
    mock_generate_pdf.assert_called()
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_parse_eml_html(self, mock_generate_pdf: mock.MagicMock):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - parsing is done with html mail
    THEN:
        - Tika is called, parsed information from non html parts is available
    """
    expected_text = (
        "Subject: HTML Message\n\n"
        "From: Name <someone@example.de>\n\n"
        "To: someone@example.de\n\n"
        "Attachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\n"
        "HTML content: tika return\n\n"
        "Some Text and an embedded image."
    )
    # The expected date carries a fixed +02:00 offset
    plus_two_hours = datetime.timezone(datetime.timedelta(seconds=7200))
    expected_date = datetime.datetime(2022, 10, 15, 11, 23, 19, tzinfo=plus_two_hours)

    # Mocked Tika response; it must be registered before parsing starts
    tika_response = {
        "Content-Type": "text/html",
        "X-TIKA:Parsed-By": [],
        "X-TIKA:content": "tika return",
    }
    self.httpx_mock.add_response(json=tika_response)

    self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")

    # Validate parsing returns the expected results
    mock_generate_pdf.assert_called_once()
    self.assertEqual(expected_text, self.parser.text)
    self.assertEqual(expected_date, self.parser.date)
def test_generate_pdf_parse_error(self):
"""
GIVEN:
- Fresh start
WHEN:
- pdf generation is requested but gotenberg fails
THEN:
- a ParseError Exception is thrown
"""
m.return_value = b""
n.return_value = b""
self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
# Check if exception is raised when the pdf can not be created.
self.parser.gotenberg_server = ""
self.assertRaises(
ParseError,
self.parser.generate_pdf,
os.path.join(self.SAMPLE_FILES, "html.eml"),
self.parser.parse,
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
)
def test_generate_pdf_exception(self):
def test_generate_pdf_simple_email(self):
"""
GIVEN:
- Fresh start
- Simple text email with no HTML content
WHEN:
- pdf generation is requested but parsing throws an exception
- Email is parsed
THEN:
- a ParseError Exception is thrown
- Gotenberg is called to generate a PDF from HTML
- Archive file is generated
"""
# Check if exception is raised when the mail can not be parsed.
self.assertRaises(
ParseError,
self.parser.generate_pdf,
os.path.join(self.SAMPLE_FILES, "broken.eml"),
self.httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
content=(self.SAMPLE_DIR / "simple_text.eml.pdf").read_bytes(),
)
@mock.patch("paperless_mail.parsers.requests.post")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf(
self,
mock_generate_pdf_from_html: mock.MagicMock,
mock_generate_pdf_from_mail: mock.MagicMock,
mock_post: mock.MagicMock,
):
self.parser.parse(self.SAMPLE_DIR / "simple_text.eml", "message/rfc822")
self.assertIsNotNone(self.parser.archive_path)
def test_generate_pdf_html_email(self):
"""
GIVEN:
- Fresh start
- email with HTML content
WHEN:
- pdf generation is requested
- Email is parsed
THEN:
- gotenberg is called and the resulting file is returned
- Gotenberg is called to generate a PDF from HTML
- Gotenberg is used to merge the two PDFs
- Archive file is generated
"""
mock_generate_pdf_from_mail.return_value = b"Mail Return"
mock_generate_pdf_from_html.return_value = b"HTML Return"
self.httpx_mock.add_response(
url="http://localhost:9998/tika/text",
method="PUT",
json={
"Content-Type": "text/html",
"X-TIKA:Parsed-By": [],
"X-TIKA:content": "This is some Tika HTML text",
},
)
self.httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
)
self.httpx_mock.add_response(
url="http://localhost:3000/forms/pdfengines/merge",
method="POST",
content=b"Pretend merged PDF content",
)
self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
mock_response = mock.MagicMock()
mock_response.content = b"Content"
mock_post.return_value = mock_response
pdf_path = self.parser.generate_pdf(os.path.join(self.SAMPLE_FILES, "html.eml"))
self.assertIsFile(pdf_path)
self.assertIsNotNone(self.parser.archive_path)
mock_generate_pdf_from_mail.assert_called_once_with(
self.parser.get_parsed(None),
def test_generate_pdf_html_email_html_to_pdf_failure(self):
"""
GIVEN:
- email with HTML content
WHEN:
- Email is parsed
- Conversion of email HTML content to PDF fails
THEN:
- ParseError is raised
"""
self.httpx_mock.add_response(
url="http://localhost:9998/tika/text",
method="PUT",
json={
"Content-Type": "text/html",
"X-TIKA:Parsed-By": [],
"X-TIKA:content": "This is some Tika HTML text",
},
)
mock_generate_pdf_from_html.assert_called_once_with(
self.parser.get_parsed(None).html,
self.parser.get_parsed(None).attachments,
self.httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
)
self.assertEqual(
self.parser.gotenberg_server + "/forms/pdfengines/merge",
mock_post.call_args.args[0],
)
self.assertEqual({}, mock_post.call_args.kwargs["headers"])
self.assertEqual(
b"Mail Return",
mock_post.call_args.kwargs["files"]["1_mail.pdf"][1].read(),
)
self.assertEqual(
b"HTML Return",
mock_post.call_args.kwargs["files"]["2_html.pdf"][1].read(),
self.httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html",
method="POST",
status_code=httpx.codes.INTERNAL_SERVER_ERROR,
)
with self.assertRaises(ParseError):
self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822")
mock_response.raise_for_status.assert_called_once()
with open(pdf_path, "rb") as file:
self.assertEqual(b"Content", file.read())
def test_generate_pdf_html_email_merge_failure(self):
    """
    GIVEN:
        - The HTML sample mail "html.eml"
        - Mocked responses for the text extraction and HTML to PDF
          conversion requests succeed
        - The mocked response for the PDF merge request is a server error
    WHEN:
        - The mail is parsed
    THEN:
        - ParseError is raised
    """
    mocked_responses = (
        # Text extraction of the HTML part
        {
            "url": "http://localhost:9998/tika/text",
            "method": "PUT",
            "json": {
                "Content-Type": "text/html",
                "X-TIKA:Parsed-By": [],
                "X-TIKA:content": "This is some Tika HTML text",
            },
        },
        # HTML to PDF conversion, answered with a sample PDF
        {
            "url": "http://localhost:3000/forms/chromium/convert/html",
            "method": "POST",
            "content": (self.SAMPLE_DIR / "html.eml.pdf").read_bytes(),
        },
        # Merging of the PDFs fails
        {
            "url": "http://localhost:3000/forms/pdfengines/merge",
            "method": "POST",
            "status_code": httpx.codes.INTERNAL_SERVER_ERROR,
        },
    )
    for response_kwargs in mocked_responses:
        self.httpx_mock.add_response(**response_kwargs)

    self.assertRaises(
        ParseError,
        self.parser.parse,
        self.SAMPLE_DIR / "html.eml",
        "message/rfc822",
    )
def test_mail_to_html(self):
"""
GIVEN:
- Fresh start
- Email message with HTML content
WHEN:
- conversion from eml to html is requested
- Email is parsed
THEN:
- html should be returned
- Resulting HTML is as expected
"""
mail = self.parser.get_parsed(os.path.join(self.SAMPLE_FILES, "html.eml"))
html_handle = self.parser.mail_to_html(mail)
html_received = html_handle.read()
mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml")
html_file = self.parser.mail_to_html(mail)
expected_html_file = self.SAMPLE_DIR / "html.eml.html"
with open(
os.path.join(self.SAMPLE_FILES, "html.eml.html"),
) as html_expected_handle:
html_expected = html_expected_handle.read()
self.assertHTMLEqual(expected_html_file.read_text(), html_file.read_text())
self.assertHTMLEqual(html_expected, html_received)
@mock.patch("paperless_mail.parsers.requests.post")
@mock.patch("paperless_mail.parsers.MailDocumentParser.mail_to_html")
def test_generate_pdf_from_mail(
self,
mock_mail_to_html: mock.MagicMock,
mock_post: mock.MagicMock,
):
"""
GIVEN:
- Fresh start
- Email message with HTML content
WHEN:
- conversion of PDF from .eml is requested
- Email is parsed
THEN:
- gotenberg should be called with valid intermediary html files, the resulting pdf is returned
- Gotenberg is used to convert HTML to PDF
"""
mock_response = mock.MagicMock()
mock_response.content = b"Content"
mock_post.return_value = mock_response
mock_mail_to_html.return_value = "Testresponse"
self.httpx_mock.add_response(content=b"Content")
mail = self.parser.get_parsed(os.path.join(self.SAMPLE_FILES, "html.eml"))
mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml")
retval = self.parser.generate_pdf_from_mail(mail)
self.assertEqual(b"Content", retval)
self.assertEqual(b"Content", retval.read_bytes())
request = self.httpx_mock.get_request()
mock_mail_to_html.assert_called_once_with(mail)
self.assertEqual(
str(request.url),
self.parser.gotenberg_server + "/forms/chromium/convert/html",
mock_post.call_args.args[0],
)
self.assertDictEqual({}, mock_post.call_args.kwargs["headers"])
self.assertDictEqual(
{
"marginTop": "0.1",
"marginBottom": "0.1",
"marginLeft": "0.1",
"marginRight": "0.1",
"paperWidth": "8.27",
"paperHeight": "11.7",
"scale": "1.0",
"pdfFormat": "PDF/A-2b",
},
mock_post.call_args.kwargs["data"],
)
self.assertEqual(
"Testresponse",
mock_post.call_args.kwargs["files"]["html"][1],
)
self.assertEqual(
"output.css",
mock_post.call_args.kwargs["files"]["css"][0],
)
mock_response.raise_for_status.assert_called_once()
def test_transform_inline_html(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - transforming of html content from an email with an inline image attachment is requested
    THEN:
        - html is returned and sanitized
    """

    class MailAttachmentMock:
        """Minimal stand-in exposing the two attributes this test sets."""

        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    # Read both samples up front so no file handle stays open while the
    # parser is running.
    with open(os.path.join(self.SAMPLE_FILES, "sample.html")) as html_file:
        html = html_file.read()
    with open(os.path.join(self.SAMPLE_FILES, "sample.png"), "rb") as png_file:
        png = png_file.read()

    attachments = [
        MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de"),
    ]
    result = self.parser.transform_inline_html(html, attachments)

    # The last entry is expected to be the transformed HTML document
    resulting_html = result[-1][1].read()
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual("index.html", result[-1][0])

    # The name of the first entry must be referenced from the HTML
    self.assertIn(result[0][0], resulting_html)
    self.assertNotIn("<script", resulting_html.lower())
@mock.patch("paperless_mail.parsers.requests.post")
def test_generate_pdf_from_html(self, mock_post: mock.MagicMock):
    """
    GIVEN:
        - Sample HTML and an inline PNG attachment
        - requests.post is replaced by a mock returning b"Content"
    WHEN:
        - A PDF is generated from the HTML and its attachment
    THEN:
        - The conversion endpoint is posted to with the expected
          headers, form data and files
        - The response status is checked
        - The content of the response is returned
    """

    class MailAttachmentMock:
        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    fake_response = mock.MagicMock(content=b"Content")
    mock_post.return_value = fake_response

    with open(os.path.join(self.SAMPLE_FILES, "sample.html")) as html_file:
        html = html_file.read()
    with open(os.path.join(self.SAMPLE_FILES, "sample.png"), "rb") as png_file:
        png = png_file.read()

    attachments = [
        MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de"),
    ]
    result = self.parser.generate_pdf_from_html(html, attachments)

    call = mock_post.call_args
    expected_url = self.parser.gotenberg_server + "/forms/chromium/convert/html"
    expected_form_data = {
        "marginTop": "0.1",
        "marginBottom": "0.1",
        "marginLeft": "0.1",
        "marginRight": "0.1",
        "paperWidth": "8.27",
        "paperHeight": "11.7",
        "scale": "1.0",
    }

    self.assertEqual(expected_url, call.args[0])
    self.assertDictEqual({}, call.kwargs["headers"])
    self.assertDictEqual(expected_form_data, call.kwargs["data"])

    # Calling read() checks that both uploads are file like objects
    posted_files = call.kwargs["files"]
    posted_files["cidpart1pNdUSz0sD3NqVtPgexamplede"][1].read()
    posted_files["index.html"][1].read()

    fake_response.raise_for_status.assert_called_once()
    self.assertEqual(b"Content", result)

View File

@ -1,29 +1,80 @@
import os
import time
from unittest import mock
from urllib.error import HTTPError
from urllib.request import urlopen
import httpx
import pytest
from django.test import TestCase
from imagehash import average_hash
from pdfminer.high_level import extract_text
from PIL import Image
from documents.parsers import run_convert
from documents.tests.utils import FileSystemAssertsMixin
from paperless_mail.parsers import MailDocumentParser
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
class TestParserLive(FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
class MailAttachmentMock:
def __init__(self, payload, content_id):
self.payload = payload
self.content_id = content_id
self.content_type = "image/png"
def setUp(self) -> None:
self.parser = MailDocumentParser(logging_group=None)
def tearDown(self) -> None:
self.parser.cleanup()
@pytest.mark.skipif(
    "PAPERLESS_CI_TEST" not in os.environ,
    reason="No Gotenberg/Tika servers to test with",
)
class TestUrlCanary(TestCase):
    """
    Verify certain URLs are still available so testing is valid still
    """

    def test_online_image_exception_on_not_available(self):
        """
        GIVEN:
            - Fresh start
        WHEN:
            - nonexistent image is requested
        THEN:
            - An exception shall be thrown

        A public image is used in the html sample file. We have no control
        whether this image stays online forever, so here we check that we
        can detect when it is not available anymore.
        """
        with self.assertRaises(httpx.HTTPStatusError) as cm:
            resp = httpx.get(
                "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
            )
            resp.raise_for_status()

        # Must stay outside the with block: statements following the raising
        # call inside the block would never be executed.
        self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND)

    def test_is_online_image_still_available(self):
        """
        GIVEN:
            - Fresh start
        WHEN:
            - A public image used in the html sample file is requested
        THEN:
            - No exception shall be thrown

        A public image is used in the html sample file. We have no control
        whether this image stays online forever, so here we check if it is
        still there.
        """
        # Now check the URL used in samples/sample.html
        resp = httpx.get("https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png")
        resp.raise_for_status()
@pytest.mark.skipif(
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
@staticmethod
def imagehash(file, hash_size=18):
return f"{average_hash(Image.open(file), hash_size)}"
@ -54,13 +105,18 @@ class TestParserLive(FileSystemAssertsMixin, TestCase):
result = method_or_callable(*args)
succeeded = True
except Exception as e:
except httpx.HTTPError as e:
raise
# Retry on HTTP errors
print(f"{e} during try #{retry_count}", flush=True)
retry_count = retry_count + 1
time.sleep(retry_time)
retry_time = retry_time * 2.0
except Exception:
# Not on other error
raise
self.assertTrue(
succeeded,
@ -79,17 +135,14 @@ class TestParserLive(FileSystemAssertsMixin, TestCase):
THEN:
- The returned thumbnail image file is as expected
"""
mock_generate_pdf.return_value = os.path.join(
self.SAMPLE_FILES,
"simple_text.eml.pdf",
)
mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf"
thumb = self.parser.get_thumbnail(
os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
)
self.assertIsFile(thumb)
expected = os.path.join(self.SAMPLE_FILES, "simple_text.eml.pdf.webp")
expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp"
self.assertEqual(
self.imagehash(thumb),
@ -97,10 +150,6 @@ class TestParserLive(FileSystemAssertsMixin, TestCase):
f"Created Thumbnail {thumb} differs from expected file {expected}",
)
@pytest.mark.skipif(
"TIKA_LIVE" not in os.environ,
reason="No tika server",
)
def test_tika_parse_successful(self):
"""
GIVEN:
@ -117,27 +166,6 @@ class TestParserLive(FileSystemAssertsMixin, TestCase):
parsed = self.parser.tika_parse(html)
self.assertEqual(expected_text, parsed.strip())
@pytest.mark.skipif(
    "TIKA_LIVE" not in os.environ,
    reason="No tika server",
)
def test_tika_parse_unsuccessful(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - tika_parse is called with None instead of HTML
    THEN:
        - An empty string is returned
    """
    self.assertEqual("", self.parser.tika_parse(None))
@pytest.mark.skipif(
"GOTENBERG_LIVE" not in os.environ,
reason="No gotenberg server",
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf_gotenberg_merging(
@ -153,15 +181,16 @@ class TestParserLive(FileSystemAssertsMixin, TestCase):
THEN:
- gotenberg is called to merge files and the resulting file is returned
"""
with open(os.path.join(self.SAMPLE_FILES, "first.pdf"), "rb") as first:
mock_generate_pdf_from_mail.return_value = first.read()
mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf"
mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf"
with open(os.path.join(self.SAMPLE_FILES, "second.pdf"), "rb") as second:
mock_generate_pdf_from_html.return_value = second.read()
msg = self.parser.parse_file_to_message(
self.SAMPLE_DIR / "html.eml",
)
pdf_path = self.util_call_with_backoff(
self.parser.generate_pdf,
[os.path.join(self.SAMPLE_FILES, "html.eml")],
[msg],
)
self.assertIsFile(pdf_path)
@ -169,38 +198,9 @@ class TestParserLive(FileSystemAssertsMixin, TestCase):
expected = (
"first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c"
)
self.assertEqual(expected, extracted)
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
def test_generate_pdf_from_mail_no_convert(self):
    """
    GIVEN:
        - The HTML sample mail "html.eml"
    WHEN:
        - A PDF is generated from the parsed mail
    THEN:
        - The text extracted from the generated PDF equals the text
          extracted from the stored sample PDF
    """
    sample_mail = os.path.join(self.SAMPLE_FILES, "html.eml")
    reference_pdf = os.path.join(self.SAMPLE_FILES, "html.eml.pdf")
    generated_pdf = os.path.join(self.parser.tempdir, "html.eml.pdf")

    mail = self.parser.get_parsed(sample_mail)

    with open(generated_pdf, "wb") as pdf_handle:
        # Generation goes through the retrying helper of this test class
        pdf_bytes = self.util_call_with_backoff(
            self.parser.generate_pdf_from_mail,
            [mail],
        )
        pdf_handle.write(pdf_bytes)

    extracted = extract_text(generated_pdf)
    expected = extract_text(reference_pdf)
    self.assertEqual(expected, extracted)
@pytest.mark.skipif(
"GOTENBERG_LIVE" not in os.environ,
reason="No gotenberg server",
)
def test_generate_pdf_from_mail(self):
"""
GIVEN:
@ -210,193 +210,32 @@ class TestParserLive(FileSystemAssertsMixin, TestCase):
THEN:
- gotenberg is called and the resulting file is returned and look as expected.
"""
mail = self.parser.get_parsed(os.path.join(self.SAMPLE_FILES, "html.eml"))
pdf_path = os.path.join(self.parser.tempdir, "html.eml.pdf")
with open(pdf_path, "wb") as file:
file.write(
self.util_call_with_backoff(self.parser.generate_pdf_from_mail, [mail]),
self.util_call_with_backoff(
self.parser.parse,
[self.SAMPLE_DIR / "html.eml", "message/rfc822"],
)
converted = os.path.join(
self.parser.tempdir,
"html.eml.pdf.webp",
# Check the archive PDF
archive_path = self.parser.get_archive_path()
archive_text = extract_text(archive_path)
expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf")
# Archive includes the HTML content, so use in
self.assertIn(expected_archive_text, archive_text)
# Check the thumbnail
generated_thumbnail = self.parser.get_thumbnail(
self.SAMPLE_DIR / "html.eml",
"message/rfc822",
)
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{pdf_path}", # Do net define an index to convert all pages.
output_file=converted,
logging_group=None,
)
self.assertIsFile(converted)
thumb_hash = self.imagehash(converted)
generated_thumbnail_hash = self.imagehash(generated_thumbnail)
# The created pdf is not reproducible. But the converted image should always look the same.
expected_hash = self.imagehash(
os.path.join(self.SAMPLE_FILES, "html.eml.pdf.webp"),
)
self.assertEqual(
thumb_hash,
expected_hash,
f"PDF looks different. Check if {converted} looks weird.",
)
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
def test_generate_pdf_from_html_no_convert(self):
    """
    GIVEN:
        - Sample HTML and an inline PNG attachment
    WHEN:
        - A PDF is generated from the HTML and its attachment
    THEN:
        - The text extracted from the generated PDF equals the text
          extracted from the stored sample PDF
    """

    class MailAttachmentMock:
        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    with open(os.path.join(self.SAMPLE_FILES, "sample.html")) as html_file:
        html = html_file.read()
    with open(os.path.join(self.SAMPLE_FILES, "sample.png"), "rb") as png_file:
        png = png_file.read()

    attachments = [
        MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de"),
    ]
    # Generation goes through the retrying helper of this test class
    pdf_bytes = self.util_call_with_backoff(
        self.parser.generate_pdf_from_html,
        [html, attachments],
    )

    generated_pdf = os.path.join(self.parser.tempdir, "sample.html.pdf")
    with open(generated_pdf, "wb") as pdf_handle:
        pdf_handle.write(pdf_bytes)

    extracted = extract_text(generated_pdf)
    expected = extract_text(os.path.join(self.SAMPLE_FILES, "sample.html.pdf"))
    self.assertEqual(expected, extracted)
@pytest.mark.skipif(
"GOTENBERG_LIVE" not in os.environ,
reason="No gotenberg server",
)
def test_generate_pdf_from_html(self):
"""
GIVEN:
- Fresh start
WHEN:
- pdf generation from html eml file is requested
THEN:
- gotenberg is called and the resulting file is returned and look as expected.
"""
class MailAttachmentMock:
def __init__(self, payload, content_id):
self.payload = payload
self.content_id = content_id
result = None
with open(os.path.join(self.SAMPLE_FILES, "sample.html")) as html_file:
with open(os.path.join(self.SAMPLE_FILES, "sample.png"), "rb") as png_file:
html = html_file.read()
png = png_file.read()
attachments = [
MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de"),
]
result = self.util_call_with_backoff(
self.parser.generate_pdf_from_html,
[html, attachments],
)
pdf_path = os.path.join(self.parser.tempdir, "sample.html.pdf")
with open(pdf_path, "wb") as file:
file.write(result)
converted = os.path.join(self.parser.tempdir, "sample.html.pdf.webp")
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{pdf_path}", # Do net define an index to convert all pages.
output_file=converted,
logging_group=None,
)
self.assertIsFile(converted)
thumb_hash = self.imagehash(converted)
# The created pdf is not reproducible. But the converted image should always look the same.
expected_hash = self.imagehash(
os.path.join(self.SAMPLE_FILES, "sample.html.pdf.webp"),
)
expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp")
self.assertEqual(
thumb_hash,
generated_thumbnail_hash,
expected_hash,
f"PDF looks different. Check if {converted} looks weird. "
f"If Rick Astley is shown, Gotenberg loads from web which is bad for Mail content.",
f"PDF looks different. Check if {generated_thumbnail} looks weird.",
)
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
def test_online_image_exception_on_not_available(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - nonexistent image is requested
    THEN:
        - An exception shall be thrown

    A public image is used in the html sample file. We have no control
    whether this image stays online forever, so here we check that we can
    detect when it is not available anymore.
    """
    # Requesting a nonexistent URL must raise an HTTPError
    missing_image_url = (
        "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png"
    )
    with self.assertRaises(HTTPError):
        urlopen(missing_image_url)
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
def test_is_online_image_still_available(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - A public image used in the html sample file is requested
    THEN:
        - No exception shall be thrown

    A public image is used in the html sample file. We have no control
    whether this image stays online forever, so here we check if it is
    still there.
    """
    # Now check the URL used in samples/sample.html.
    # urlopen raises on HTTP errors, so a successful open is the whole
    # check; the context manager closes the response so the connection is
    # not leaked.
    with urlopen("https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png"):
        pass

View File

@ -335,7 +335,7 @@ class RasterisedDocumentParser(DocumentParser):
self.text = text_original
except (NoTextFoundException, InputFileError) as e:
self.log.warning(
f"Encountered an error while running OCR: {str(e)}. "
f"Encountered an error while running OCR: {e!s}. "
f"Attempting force OCR to get the text.",
)
@ -370,11 +370,11 @@ class RasterisedDocumentParser(DocumentParser):
except Exception as e:
# If this fails, we have a serious issue at hand.
raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
except Exception as e:
# Anything else is probably serious.
raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
# As a last resort, if we still don't have any text for any reason,
# try to extract the text from the original document.

View File

@ -1,10 +1,9 @@
import os
from pathlib import Path
import dateutil.parser
import requests
import httpx
from django.conf import settings
from tika import parser
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@ -29,55 +28,38 @@ class TikaDocumentParser(DocumentParser):
)
def extract_metadata(self, document_path, mime_type):
tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings
# ensure this is a string
document_path = str(document_path)
try:
parsed = parser.from_file(document_path, tika_server)
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.metadata.from_file(document_path, mime_type)
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed.data[key],
}
for key in parsed.data
]
except Exception as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
)
return []
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed["metadata"][key],
}
for key in parsed["metadata"]
]
def parse(self, document_path: Path, mime_type, file_name=None):
def parse(self, document_path: Path, mime_type: str, file_name=None):
self.log.info(f"Sending {document_path} to Tika server")
tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings
# ensure this is a string
document_path = str(document_path)
try:
parsed = parser.from_file(document_path, tika_server)
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.tika.as_text.from_file(document_path, mime_type)
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
f"{tika_server}: {err}",
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
self.text = parsed["content"].strip()
try:
self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
except Exception as e:
self.log.warning(
f"Unable to extract date for document {document_path}: {e}",
)
self.text = parsed.content.strip()
self.date = parsed.metadata.created
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):
@ -106,7 +88,7 @@ class TikaDocumentParser(DocumentParser):
data["pdfFormat"] = "PDF/A-3b"
try:
response = requests.post(url, files=files, headers=headers, data=data)
response = httpx.post(url, files=files, headers=headers, data=data)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(

View File

@ -9,7 +9,10 @@ from django.test import TestCase
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
@pytest.mark.skipif(
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestTikaParserAgainstServer(TestCase):
"""
This test case tests the Tika parsing against a live tika server,
@ -25,7 +28,7 @@ class TestTikaParserAgainstServer(TestCase):
def tearDown(self) -> None:
self.parser.cleanup()
def try_parse_with_wait(self, test_file, mime_type):
def try_parse_with_wait(self, test_file: Path, mime_type: str):
"""
For whatever reason, the image started during the test pipeline likes to
segfault sometimes, when run with the exact files that usually pass.

View File

@ -5,34 +5,38 @@ from unittest import mock
from django.test import TestCase
from django.test import override_settings
from requests import Response
from httpx import Request
from httpx import Response
from rest_framework import status
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
from paperless_tika.tests.utils import HttpxMockMixin
class TestTikaParser(TestCase):
class TestTikaParser(HttpxMockMixin, TestCase):
def setUp(self) -> None:
self.parser = TikaDocumentParser(logging_group=None)
def tearDown(self) -> None:
self.parser.cleanup()
@mock.patch("paperless_tika.parsers.parser.from_file")
@mock.patch("paperless_tika.parsers.requests.post")
def test_parse(self, post, from_file):
from_file.return_value = {
"content": "the content",
"metadata": {"Creation-Date": "2020-11-21"},
}
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_200_OK
post.return_value = response
def test_parse(self):
# Pretend parse response
self.httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
"X-TIKA:content": "the content",
"dcterms:created": "2020-11-21T00:00:00",
},
)
# Pretend convert to PDF response
self.httpx_mock.add_response(content=b"PDF document")
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
self.parser.parse(file, "application/vnd.oasis.opendocument.text")
self.assertEqual(self.parser.text, "the content")
@ -42,26 +46,28 @@ class TestTikaParser(TestCase):
self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21))
@mock.patch("paperless_tika.parsers.parser.from_file")
def test_metadata(self, from_file):
from_file.return_value = {
"metadata": {"Creation-Date": "2020-11-21", "Some-key": "value"},
}
def test_metadata(self):
self.httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
"Some-key": "value",
"dcterms:created": "2020-11-21T00:00:00",
},
)
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
metadata = self.parser.extract_metadata(
file,
"application/vnd.oasis.opendocument.text",
)
self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
self.assertTrue("Some-key" in [m["key"] for m in metadata])
@mock.patch("paperless_tika.parsers.parser.from_file")
@mock.patch("paperless_tika.parsers.requests.post")
def test_convert_failure(self, post, from_file):
def test_convert_failure(self):
"""
GIVEN:
- Document needs to be converted to PDF
@ -70,22 +76,16 @@ class TestTikaParser(TestCase):
THEN:
- Parse error is raised
"""
from_file.return_value = {
"content": "the content",
"metadata": {"Creation-Date": "2020-11-21"},
}
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
post.return_value = response
# Pretend convert to PDF response
self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
with self.assertRaises(ParseError):
self.parser.convert_to_pdf(file, None)
@mock.patch("paperless_tika.parsers.requests.post")
@mock.patch("paperless_tika.parsers.httpx.post")
def test_request_pdf_a_format(self, post: mock.Mock):
"""
GIVEN:
@ -95,12 +95,11 @@ class TestTikaParser(TestCase):
THEN:
- Request to Gotenberg contains the expected PDF/A format string
"""
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_200_OK
response = Response(status_code=status.HTTP_200_OK)
response.request = Request("POST", "/somewhere/")
post.return_value = response
for setting, expected_key in [

View File

@ -0,0 +1,11 @@
import pytest
from pytest_httpx import HTTPXMock
class HttpxMockMixin:
    """
    Mixin which makes the ``httpx_mock`` fixture available as
    ``self.httpx_mock`` in unittest style test classes.
    """

    @pytest.fixture(autouse=True)
    def httpx_mock_auto(self, httpx_mock: HTTPXMock) -> None:
        """
        Workaround for allowing use of a fixture with unittest style testing

        The fixture is marked autouse and stores the received ``httpx_mock``
        on the instance, so test methods can reach it through ``self``.
        """
        self.httpx_mock = httpx_mock