Merge branch 'dev'

This commit is contained in:
Michael Shamoon 2023-01-01 14:11:46 -08:00
commit 729f25c435
11 changed files with 185 additions and 73 deletions

View File

@ -165,6 +165,7 @@ COPY [ \
"docker/docker-prepare.sh", \
"docker/paperless_cmd.sh", \
"docker/wait-for-redis.py", \
"docker/env-from-file.sh", \
"docker/management_script.sh", \
"docker/flower-conditional.sh", \
"docker/install_management_commands.sh", \
@ -184,6 +185,8 @@ RUN set -eux \
&& chmod 755 /sbin/docker-prepare.sh \
&& mv wait-for-redis.py /sbin/wait-for-redis.py \
&& chmod 755 /sbin/wait-for-redis.py \
&& mv env-from-file.sh /sbin/env-from-file.sh \
&& chmod 755 /sbin/env-from-file.sh \
&& mv paperless_cmd.sh /usr/local/bin/paperless_cmd.sh \
&& chmod 755 /usr/local/bin/paperless_cmd.sh \
&& mv flower-conditional.sh /usr/local/bin/flower-conditional.sh \

View File

@ -2,37 +2,6 @@
set -e
# Adapted from:
# https://github.com/docker-library/postgres/blob/master/docker-entrypoint.sh
# usage: file_env VAR
# ie: file_env 'XYZ_DB_PASSWORD' will allow for "$XYZ_DB_PASSWORD_FILE" to
# fill in the value of "$XYZ_DB_PASSWORD" from a file, especially for Docker's
# secrets feature
file_env() {
local -r var="$1"
local -r fileVar="${var}_FILE"
# Basic validation
if [ "${!var:-}" ] && [ "${!fileVar:-}" ]; then
echo >&2 "error: both $var and $fileVar are set (but are exclusive)"
exit 1
fi
# Only export var if the _FILE exists
if [ "${!fileVar:-}" ]; then
# And the file exists
if [[ -f ${!fileVar} ]]; then
echo "Setting ${var} from file"
val="$(< "${!fileVar}")"
export "$var"="$val"
else
echo "File ${!fileVar} doesn't exist"
exit 1
fi
fi
}
# Source: https://github.com/sameersbn/docker-gitlab/
map_uidgid() {
local -r usermap_original_uid=$(id -u paperless)
@ -96,19 +65,11 @@ custom_container_init() {
initialize() {
# Setup environment from secrets before anything else
for env_var in \
PAPERLESS_DBUSER \
PAPERLESS_DBPASS \
PAPERLESS_SECRET_KEY \
PAPERLESS_AUTO_LOGIN_USERNAME \
PAPERLESS_ADMIN_USER \
PAPERLESS_ADMIN_MAIL \
PAPERLESS_ADMIN_PASSWORD \
PAPERLESS_REDIS; do
# Check for a version of this var with _FILE appended
# and convert the contents to the env var value
file_env ${env_var}
done
# Check for a version of this var with _FILE appended
# and convert the contents to the env var value
# Source it so export is persistent
# shellcheck disable=SC1091
source /sbin/env-from-file.sh
# Change the user and group IDs if needed
map_uidgid

39
docker/env-from-file.sh Normal file
View File

@ -0,0 +1,39 @@
#!/usr/bin/env bash
# Scans the environment variables for those with the suffix _FILE
# When located, checks the file exists, and exports the contents
# of the file as the same name, minus the suffix
# This allows the use of Docker secrets or mounted files
# to fill in any of the settings configurable via environment
# variables
set -eu
for line in $(printenv)
do
# Extract the name of the environment variable
env_name=${line%%=*}
# Check if it ends in "_FILE"
if [[ ${env_name} == *_FILE ]]; then
# Extract the value of the environment
env_value=${line#*=}
# Check the file exists
if [[ -f ${env_value} ]]; then
# Trim off the _FILE suffix
non_file_env_name=${env_name%"_FILE"}
echo "Setting ${non_file_env_name} from file"
# Reads the value from th file
val="$(< "${!env_name}")"
# Sets the normal name to the read file contents
export "${non_file_env_name}"="${val}"
else
echo "File ${env_value} doesn't exist"
exit 1
fi
fi
done

View File

@ -3,6 +3,9 @@
set -e
cd /usr/src/paperless/src/
# This ensures environment is setup
# shellcheck disable=SC1091
source /sbin/env-from-file.sh
if [[ $(id -u) == 0 ]] ;
then

View File

@ -31,7 +31,7 @@
[editing]="true"
[multiple]="true"
[applyOnClose]="applyOnClose"
(open)="openTagsDropdown()"
(opened)="openTagsDropdown()"
[(selectionModel)]="tagSelectionModel"
(apply)="setTags($event)">
</app-filterable-dropdown>
@ -40,7 +40,7 @@
[items]="correspondents"
[editing]="true"
[applyOnClose]="applyOnClose"
(open)="openCorrespondentDropdown()"
(opened)="openCorrespondentDropdown()"
[(selectionModel)]="correspondentSelectionModel"
(apply)="setCorrespondents($event)">
</app-filterable-dropdown>
@ -49,7 +49,7 @@
[items]="documentTypes"
[editing]="true"
[applyOnClose]="applyOnClose"
(open)="openDocumentTypeDropdown()"
(opened)="openDocumentTypeDropdown()"
[(selectionModel)]="documentTypeSelectionModel"
(apply)="setDocumentTypes($event)">
</app-filterable-dropdown>
@ -58,7 +58,7 @@
[items]="storagePaths"
[editing]="true"
[applyOnClose]="applyOnClose"
(open)="openStoragePathDropdown()"
(opened)="openStoragePathDropdown()"
[(selectionModel)]="storagePathsSelectionModel"
(apply)="setStoragePaths($event)">
</app-filterable-dropdown>

View File

@ -30,27 +30,27 @@
[(selectionModel)]="tagSelectionModel"
(selectionModelChange)="updateRules()"
[multiple]="true"
(open)="onTagsDropdownOpen()"
(opened)="onTagsDropdownOpen()"
[allowSelectNone]="true"></app-filterable-dropdown>
<app-filterable-dropdown class="flex-fill" title="Correspondent" icon="person-fill" i18n-title
filterPlaceholder="Filter correspondents" i18n-filterPlaceholder
[items]="correspondents"
[(selectionModel)]="correspondentSelectionModel"
(selectionModelChange)="updateRules()"
(open)="onCorrespondentDropdownOpen()"
(opened)="onCorrespondentDropdownOpen()"
[allowSelectNone]="true"></app-filterable-dropdown>
<app-filterable-dropdown class="flex-fill" title="Document type" icon="file-earmark-fill" i18n-title
filterPlaceholder="Filter document types" i18n-filterPlaceholder
[items]="documentTypes"
[(selectionModel)]="documentTypeSelectionModel"
(open)="onDocumentTypeDropdownOpen()"
(opened)="onDocumentTypeDropdownOpen()"
(selectionModelChange)="updateRules()"
[allowSelectNone]="true"></app-filterable-dropdown>
<app-filterable-dropdown class="me-2 flex-fill" title="Storage path" icon="folder-fill" i18n-title
filterPlaceholder="Filter storage paths" i18n-filterPlaceholder
[items]="storagePaths"
[(selectionModel)]="storagePathSelectionModel"
(open)="onStoragePathDropdownOpen()"
(opened)="onStoragePathDropdownOpen()"
(selectionModelChange)="updateRules()"
[allowSelectNone]="true"></app-filterable-dropdown>
</div>

View File

@ -5,7 +5,7 @@ export const environment = {
apiBaseUrl: document.baseURI + 'api/',
apiVersion: '2',
appTitle: 'Paperless-ngx',
version: '1.11.2',
version: '1.11.2-dev',
webSocketHost: window.location.host,
webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:',
webSocketBaseUrl: base_url.pathname + 'ws/',

View File

@ -793,6 +793,8 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload(self, m):
m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
"rb",
@ -816,6 +818,8 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_empty_metadata(self, m):
m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
"rb",
@ -839,6 +843,8 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_invalid_form(self, m):
m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
"rb",
@ -853,6 +859,8 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_invalid_file(self, m):
m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.zip"),
"rb",
@ -866,6 +874,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_title(self, async_task):
async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
"rb",
@ -884,6 +895,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_correspondent(self, async_task):
async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
c = Correspondent.objects.create(name="test-corres")
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -903,6 +917,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_correspondent(self, async_task):
async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
"rb",
@ -917,6 +934,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_document_type(self, async_task):
async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
dt = DocumentType.objects.create(name="invoice")
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -936,6 +956,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_document_type(self, async_task):
async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
"rb",
@ -950,6 +973,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_tags(self, async_task):
async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
t1 = Tag.objects.create(name="tag1")
t2 = Tag.objects.create(name="tag2")
with open(
@ -970,6 +996,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_tags(self, async_task):
async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
t1 = Tag.objects.create(name="tag1")
t2 = Tag.objects.create(name="tag2")
with open(
@ -986,6 +1015,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_created(self, async_task):
async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
created = datetime.datetime(
2022,
5,
@ -2948,6 +2980,59 @@ class TestTasks(APITestCase):
self.assertEqual(returned_task2["status"], celery.states.PENDING)
self.assertEqual(returned_task2["task_file_name"], task2.task_file_name)
def test_get_single_task_status(self):
"""
GIVEN
- Query parameter for a valid task ID
WHEN:
- API call is made to get task status
THEN:
- Single task data is returned
"""
id1 = str(uuid.uuid4())
task1 = PaperlessTask.objects.create(
task_id=id1,
task_file_name="task_one.pdf",
)
_ = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_two.pdf",
)
response = self.client.get(self.ENDPOINT + f"?task_id={id1}")
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_task1 = response.data[0]
self.assertEqual(returned_task1["task_id"], task1.task_id)
def test_get_single_task_status_not_valid(self):
"""
GIVEN
- Query parameter for a non-existent task ID
WHEN:
- API call is made to get task status
THEN:
- No task data is returned
"""
task1 = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_one.pdf",
)
_ = PaperlessTask.objects.create(
task_id=str(uuid.uuid4()),
task_file_name="task_two.pdf",
)
response = self.client.get(self.ENDPOINT + "?task_id=bad-task-id")
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 0)
def test_acknowledge_tasks(self):
"""
GIVEN:

View File

@ -617,7 +617,7 @@ class PostDocumentView(GenericAPIView):
task_id = str(uuid.uuid4())
consume_file.delay(
async_task = consume_file.delay(
temp_filename,
override_filename=doc_name,
override_title=title,
@ -628,7 +628,7 @@ class PostDocumentView(GenericAPIView):
override_created=created,
)
return Response("OK")
return Response(async_task.id)
class SelectionDataView(GenericAPIView):
@ -886,13 +886,18 @@ class TasksViewSet(ReadOnlyModelViewSet):
permission_classes = (IsAuthenticated,)
serializer_class = TasksViewSerializer
queryset = (
PaperlessTask.objects.filter(
acknowledged=False,
def get_queryset(self):
queryset = (
PaperlessTask.objects.filter(
acknowledged=False,
)
.order_by("date_created")
.reverse()
)
.order_by("date_created")
.reverse()
)
task_id = self.request.query_params.get("task_id")
if task_id is not None:
queryset = PaperlessTask.objects.filter(task_id=task_id)
return queryset
class AcknowledgeTasksView(GenericAPIView):

View File

@ -8,6 +8,8 @@ import requests
from bleach import clean
from bleach import linkify
from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from documents.parsers import DocumentParser
from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError
@ -135,7 +137,11 @@ class MailDocumentParser(DocumentParser):
self.text += f"\n\n{strip_text(mail.text)}"
self.date = mail.date
if is_naive(mail.date):
self.date = make_aware(mail.date)
else:
self.date = mail.date
self.archive_path = self.generate_pdf(document_path)
def tika_parse(self, html: str):

View File

@ -1,6 +1,8 @@
import json
import os
import re
from pathlib import Path
from typing import Optional
from django.conf import settings
from documents.parsers import DocumentParser
@ -99,7 +101,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("warning", f"Error while calculating DPI for image {image}: {e}")
return None
def extract_text(self, sidecar_file, pdf_file):
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
# When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case
if (
@ -139,11 +141,15 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Detected language {lang}")
if lang in {
"ar", # Arabic
"he", # Hebrew,
"fa", # Persian
}:
if (
lang
in {
"ar", # Arabic
"he", # Hebrew,
"fa", # Persian
}
and pdf_file.name != "archive-fallback.pdf"
):
raise RtlLanguageException()
return stripped
except RtlLanguageException:
@ -275,7 +281,7 @@ class RasterisedDocumentParser(DocumentParser):
return ocrmypdf_args
def parse(self, document_path, mime_type, file_name=None):
def parse(self, document_path: Path, mime_type, file_name=None):
# This forces tesseract to use one core per page.
os.environ["OMP_THREAD_LIMIT"] = "1"
@ -300,8 +306,8 @@ class RasterisedDocumentParser(DocumentParser):
import ocrmypdf
from ocrmypdf import InputFileError, EncryptedPdfError
archive_path = os.path.join(self.tempdir, "archive.pdf")
sidecar_file = os.path.join(self.tempdir, "sidecar.txt")
archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
args = self.construct_ocrmypdf_parameters(
document_path,
@ -335,8 +341,12 @@ class RasterisedDocumentParser(DocumentParser):
f"Attempting force OCR to get the text.",
)
archive_path_fallback = os.path.join(self.tempdir, "archive-fallback.pdf")
sidecar_file_fallback = os.path.join(self.tempdir, "sidecar-fallback.txt")
archive_path_fallback = Path(
os.path.join(self.tempdir, "archive-fallback.pdf"),
)
sidecar_file_fallback = Path(
os.path.join(self.tempdir, "sidecar-fallback.txt"),
)
# Attempt to run OCR with safe settings.