Merge branch 'dev' into feature-permissions

shamoon 2023-02-14 11:32:37 -08:00
commit 32754defef
31 changed files with 327 additions and 90 deletions

View File

@@ -161,7 +161,7 @@ jobs:
         pipenv --python ${{ steps.setup-python.outputs.python-version }} run pytest -ra
     -
       name: Upload coverage to Codecov
-      if: matrix.python-version == ${{ env.DEFAULT_PYTHON_VERSION }}
+      if: ${{ matrix.python-version == env.DEFAULT_PYTHON_VERSION }}
       uses: codecov/codecov-action@v3
       with:
         # not required for public repos, but intermittently fails otherwise

View File

@@ -59,7 +59,7 @@ services:
       - gotenberg
       - tika
     ports:
-      - 8000:8000
+      - "8000:8000"
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8000"]
       interval: 30s
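
Note: quoting the mapping sidesteps YAML 1.1's base-60 integer syntax, under which an unquoted a:b scalar whose colon-separated groups are all below 60 parses as a number rather than a string. "8000:8000" happens to survive unquoted, but short mappings such as 22:22 do not, so quoting every port is the safe convention. A quick PyYAML demonstration (illustrative only, not part of this diff):

    import yaml  # PyYAML resolves plain scalars with YAML 1.1 rules

    # Groups after the colon must be 0-59 to match the sexagesimal
    # pattern, so this stays a string even without quotes...
    print(yaml.safe_load("ports:\n  - 8000:8000"))  # {'ports': ['8000:8000']}

    # ...but a low port number is silently turned into an integer:
    print(yaml.safe_load("ports:\n  - 22:22"))      # {'ports': [1342]} (22*60 + 22)
    print(yaml.safe_load('ports:\n  - "22:22"'))    # {'ports': ['22:22']}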

View File

@@ -53,7 +53,7 @@ services:
       - db
       - broker
     ports:
-      - 8000:8000
+      - "8000:8000"
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8000"]
       interval: 30s

View File

@@ -53,7 +53,7 @@ services:
       - db
       - broker
     ports:
-      - 8010:8000
+      - "8010:8000"
     healthcheck:
       test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"]
       interval: 30s

View File

@@ -57,7 +57,7 @@ services:
       - gotenberg
       - tika
     ports:
-      - 8000:8000
+      - "8000:8000"
     healthcheck:
       test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"]
       interval: 30s

View File

@@ -51,7 +51,7 @@ services:
       - db
       - broker
     ports:
-      - 8000:8000
+      - "8000:8000"
     healthcheck:
       test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"]
       interval: 30s

View File

@@ -46,7 +46,7 @@ services:
       - gotenberg
       - tika
     ports:
-      - 8000:8000
+      - "8000:8000"
     healthcheck:
       test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"]
       interval: 30s

View File

@@ -37,7 +37,7 @@ services:
     depends_on:
       - broker
     ports:
-      - 8000:8000
+      - "8000:8000"
    healthcheck:
       test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"]
       interval: 30s

View File

@@ -3,5 +3,10 @@
 echo "Checking if we should start flower..."

 if [[ -n "${PAPERLESS_ENABLE_FLOWER}" ]]; then
-	celery --app paperless flower
+	# Small delay to allow celery to be up first
+	echo "Starting flower in 5s"
+	sleep 5
+	celery --app paperless flower --conf=/usr/src/paperless/src/paperless/flowerconfig.py
+else
+	echo "Not starting flower"
 fi
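
Note: the --conf flag points Flower at a plain-Python configuration module in which module-level variables mirror Flower's command-line options. The repository's flowerconfig.py is not shown in this diff; the following is only a hypothetical sketch of what such a file can contain:

    # flowerconfig.py -- hypothetical example; each assignment mirrors a
    # Flower CLI option of the same name (e.g. --port, --max_tasks).
    port = 5555         # HTTP port for the monitoring UI
    max_tasks = 10000   # cap the task history kept in memory
    persistent = False  # do not persist state between restarts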

View File

@@ -346,7 +346,7 @@ read -r -a OCR_LANGUAGES_ARRAY <<< "${_split_langs}"
   fi
 } > docker-compose.env

-sed -i "s/- 8000:8000/- $PORT:8000/g" docker-compose.yml
+sed -i "s/- \"8000:8000\"/- \"$PORT:8000\"/g" docker-compose.yml

 sed -i "s#- \./consume:/usr/src/paperless/consume#- $CONSUME_FOLDER:/usr/src/paperless/consume#g" docker-compose.yml

View File

@@ -18,7 +18,7 @@
   (blur)="onBlur()">
   <ng-template ng-label-tmp let-item="item">
-    <span class="tag-wrap tag-wrap-delete" (click)="removeTag(item.id)">
+    <span class="tag-wrap tag-wrap-delete" (mousedown)="removeTag($event, item.id)">
       <svg width="1.2em" height="1em" viewBox="0 0 16 16" fill="currentColor" xmlns="http://www.w3.org/2000/svg">
         <use xlink:href="assets/bootstrap-icons.svg#x"/>
       </svg>

View File

@@ -65,7 +65,7 @@ export class TagsComponent implements OnInit, ControlValueAccessor {
   private _lastSearchTerm: string

-  getTag(id) {
+  getTag(id: number) {
     if (this.tags) {
       return this.tags.find((tag) => tag.id == id)
     } else {
@@ -73,8 +73,12 @@ export class TagsComponent implements OnInit, ControlValueAccessor {
     }
   }

-  removeTag(id) {
+  removeTag(event: PointerEvent, id: number) {
     if (this.disabled) return
+
+    // prevent opening dropdown
+    event.stopImmediatePropagation()
+
     let index = this.value.indexOf(id)
     if (index > -1) {
       let oldValue = this.value

View File

@@ -63,7 +63,7 @@
 <div class="row">
-  <div class="col mb-4">
+  <div class="col-md-6 col-xl-4 mb-4">
     <form [formGroup]='documentForm' (ngSubmit)="save()">

View File

@@ -22,6 +22,15 @@
   --page-margin: 1px 0 20px;
 }

+::ng-deep .ng-select-taggable {
+  max-width: calc(100% - 46px); // fudge factor for ng-select button width
+}
+
+.btn-group .dropdown-toggle-split {
+  border-top-right-radius: inherit;
+  border-bottom-right-radius: inherit;
+}
+
 .password-prompt {
   position: absolute;
   top: 30%;

View File

@@ -10,7 +10,7 @@
     </div>
   </div>
-  <ngb-pagination class="col-auto" [pageSize]="25" [collectionSize]="collectionSize" [(page)]="page" (pageChange)="reloadData()" aria-label="Default pagination"></ngb-pagination>
+  <ngb-pagination class="col-auto" [pageSize]="25" [collectionSize]="collectionSize" [(page)]="page" [maxSize]="5" (pageChange)="reloadData()" aria-label="Default pagination"></ngb-pagination>
 </div>

 <table class="table table-striped align-middle border shadow-sm">
@@ -72,5 +72,5 @@
 <div class="d-flex">
   <div i18n *ngIf="collectionSize > 0">{collectionSize, plural, =1 {One {{typeName}}} other {{{collectionSize || 0}} total {{typeNamePlural}}}}</div>
-  <ngb-pagination *ngIf="collectionSize > 20" class="ms-auto" [pageSize]="25" [collectionSize]="collectionSize" [(page)]="page" (pageChange)="reloadData()" aria-label="Default pagination"></ngb-pagination>
+  <ngb-pagination *ngIf="collectionSize > 20" class="ms-auto" [pageSize]="25" [collectionSize]="collectionSize" [(page)]="page" [maxSize]="5" (pageChange)="reloadData()" aria-label="Default pagination"></ngb-pagination>
 </div>

View File

@@ -325,11 +325,10 @@ def save_to_dir(
     Optionally rename the file.
     """
     if os.path.isfile(filepath) and os.path.isdir(target_dir):
-        dst = shutil.copy(filepath, target_dir)
-        logging.debug(f"saved {str(filepath)} to {str(dst)}")
-        if newname:
-            dst_new = os.path.join(target_dir, newname)
-            logger.debug(f"moving {str(dst)} to {str(dst_new)}")
-            os.rename(dst, dst_new)
+        dest = target_dir
+        if newname is not None:
+            dest = os.path.join(dest, newname)
+        shutil.copy(filepath, dest)
+        logging.debug(f"saved {str(filepath)} to {str(dest)}")
     else:
         logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
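
Note: the rewrite leans on shutil.copy accepting either a directory (the source basename is kept) or a full destination path, so the old copy-then-os.rename sequence collapses into a single copy and the debug log reports the real final location. A standalone illustration:

    import shutil
    import tempfile
    from pathlib import Path

    src_dir, dst_dir = Path(tempfile.mkdtemp()), Path(tempfile.mkdtemp())
    src = src_dir / "scan.pdf"
    src.write_bytes(b"%PDF-1.4")

    # Destination is a directory: the basename "scan.pdf" is kept.
    print(shutil.copy(src, dst_dir))

    # Destination is a file path: copy and rename in one step.
    print(shutil.copy(src, dst_dir / "1_scan.pdf"))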

View File

@@ -346,6 +346,7 @@ class Consumer(LoggingMixin):
             mime_type,
         )

         if not parser_class:
+            tempdir.cleanup()
             self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")

         # Notify all listeners that we're going to do some work.
@@ -404,6 +405,7 @@ class Consumer(LoggingMixin):
         except ParseError as e:
             document_parser.cleanup()
+            tempdir.cleanup()
             self._fail(
                 str(e),
                 f"Error while consuming document {self.filename}: {e}",

View File

@@ -779,11 +779,17 @@ class StoragePathSerializer(MatchingModelSerializer, OwnedObjectSerializer):
             document_type="document_type",
             created="created",
             created_year="created_year",
+            created_year_short="created_year_short",
             created_month="created_month",
+            created_month_name="created_month_name",
+            created_month_name_short="created_month_name_short",
             created_day="created_day",
             added="added",
             added_year="added_year",
+            added_year_short="added_year_short",
             added_month="added_month",
+            added_month_name="added_month_name",
+            added_month_name_short="added_month_name_short",
             added_day="added_day",
             asn="asn",
             tags="tags",

View File

@@ -130,6 +130,18 @@ def consume_file(
         )

         if document_list:
+            # If the file is an upload, it's in the scratch directory
+            # Move it to consume directory to be picked up
+            # Otherwise, use the current parent to keep possible tags
+            # from subdirectories
+            try:
+                # is_relative_to would be nicer, but new in 3.9
+                _ = path.relative_to(settings.SCRATCH_DIR)
+                save_to_dir = settings.CONSUMPTION_DIR
+            except ValueError:
+                save_to_dir = path.parent
+
             for n, document in enumerate(document_list):
                 # save to consumption dir
                 # rename it to the original filename with number prefix
@@ -138,23 +150,18 @@ def consume_file(
                 else:
                     newname = None

-                # If the file is an upload, it's in the scratch directory
-                # Move it to consume directory to be picked up
-                # Otherwise, use the current parent to keep possible tags
-                # from subdirectories
-                try:
-                    # is_relative_to would be nicer, but new in 3.9
-                    _ = path.relative_to(settings.SCRATCH_DIR)
-                    save_to_dir = settings.CONSUMPTION_DIR
-                except ValueError:
-                    save_to_dir = path.parent
-
                 barcodes.save_to_dir(
                     document,
                     newname=newname,
                     target_dir=save_to_dir,
                 )

+                # Split file has been copied safely, remove it
+                os.remove(document)
+
+            # And clean up the directory as well, now it's empty
+            shutil.rmtree(os.path.dirname(document_list[0]))
+
             # Delete the PDF file which was split
             os.remove(doc_barcode_info.pdf_path)
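
Note: the relative_to/ValueError dance is the usual stand-in for Path.is_relative_to, which only appeared in Python 3.9; hoisting it above the loop also runs the check once per file instead of once per split document. The equivalent helper, for reference:

    from pathlib import Path

    def is_relative_to(path: Path, other: Path) -> bool:
        """Backport of Path.is_relative_to (added in Python 3.9)."""
        try:
            path.relative_to(other)  # raises ValueError for non-subpaths
            return True
        except ValueError:
            return False

    assert is_relative_to(Path("/tmp/scratch/up.pdf"), Path("/tmp/scratch"))
    assert not is_relative_to(Path("/data/consume/up.pdf"), Path("/tmp/scratch"))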

View File

@@ -125,28 +125,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         response = self.client.get("/api/documents/", format="json")
         self.assertEqual(response.status_code, 200)
         results_full = response.data["results"]
-        self.assertTrue("content" in results_full[0])
-        self.assertTrue("id" in results_full[0])
+        self.assertIn("content", results_full[0])
+        self.assertIn("id", results_full[0])

         response = self.client.get("/api/documents/?fields=id", format="json")
         self.assertEqual(response.status_code, 200)
         results = response.data["results"]
         self.assertFalse("content" in results[0])
-        self.assertTrue("id" in results[0])
+        self.assertIn("id", results[0])
         self.assertEqual(len(results[0]), 1)

         response = self.client.get("/api/documents/?fields=content", format="json")
         self.assertEqual(response.status_code, 200)
         results = response.data["results"]
-        self.assertTrue("content" in results[0])
+        self.assertIn("content", results[0])
         self.assertFalse("id" in results[0])
         self.assertEqual(len(results[0]), 1)

         response = self.client.get("/api/documents/?fields=id,content", format="json")
         self.assertEqual(response.status_code, 200)
         results = response.data["results"]
-        self.assertTrue("content" in results[0])
-        self.assertTrue("id" in results[0])
+        self.assertIn("content", results[0])
+        self.assertIn("id", results[0])
         self.assertEqual(len(results[0]), 2)

         response = self.client.get(
@@ -156,7 +156,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertEqual(response.status_code, 200)
         results = response.data["results"]
         self.assertFalse("content" in results[0])
-        self.assertTrue("id" in results[0])
+        self.assertIn("id", results[0])
         self.assertEqual(len(results[0]), 1)

         response = self.client.get("/api/documents/?fields=", format="json")
@@ -3291,8 +3291,32 @@ class TestApiStoragePaths(DirectoriesMixin, APITestCase):
         self.assertEqual(response.status_code, 400)
         self.assertEqual(StoragePath.objects.count(), 1)

+    def test_api_storage_path_placeholders(self):
+        """
+        GIVEN:
+            - API request to create a storage path with placeholders
+            - Storage path is valid
+        WHEN:
+            - API is called
+        THEN:
+            - Correct HTTP response
+            - New storage path is created
+        """
+        response = self.client.post(
+            self.ENDPOINT,
+            json.dumps(
+                {
+                    "name": "Storage path with placeholders",
+                    "path": "{title}/{correspondent}/{document_type}/{created}/{created_year}/{created_year_short}/{created_month}/{created_month_name}/{created_month_name_short}/{created_day}/{added}/{added_year}/{added_year_short}/{added_month}/{added_month_name}/{added_month_name_short}/{added_day}/{asn}/{tags}/{tag_list}/",
+                },
+            ),
+            content_type="application/json",
+        )
+        self.assertEqual(response.status_code, 201)
+        self.assertEqual(StoragePath.objects.count(), 2)
+

-class TestTasks(APITestCase):
+class TestTasks(DirectoriesMixin, APITestCase):
     ENDPOINT = "/api/tasks/"
     ENDPOINT_ACKNOWLEDGE = "/api/acknowledge_tasks/"

View File

@@ -847,13 +847,11 @@ class PreConsumeTestCase(TestCase):
             self.assertEqual(command[0], script.name)
             self.assertEqual(command[1], "path-to-file")

-            self.assertDictContainsSubset(
-                {
-                    "DOCUMENT_SOURCE_PATH": c.original_path,
-                    "DOCUMENT_WORKING_PATH": c.path,
-                },
-                environment,
-            )
+            subset = {
+                "DOCUMENT_SOURCE_PATH": c.original_path,
+                "DOCUMENT_WORKING_PATH": c.path,
+            }
+
+            self.assertDictEqual(environment, {**environment, **subset})

     @mock.patch("documents.consumer.Consumer.log")
     def test_script_with_output(self, mocked_log):
@@ -983,16 +981,15 @@ class PostConsumeTestCase(TestCase):
         self.assertEqual(command[7], "my_bank")
         self.assertCountEqual(command[8].split(","), ["a", "b"])

-        self.assertDictContainsSubset(
-            {
-                "DOCUMENT_ID": str(doc.pk),
-                "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
-                "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
-                "DOCUMENT_CORRESPONDENT": "my_bank",
-                "DOCUMENT_TAGS": "a,b",
-            },
-            environment,
-        )
+        subset = {
+            "DOCUMENT_ID": str(doc.pk),
+            "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
+            "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
+            "DOCUMENT_CORRESPONDENT": "my_bank",
+            "DOCUMENT_TAGS": "a,b",
+        }
+
+        self.assertDictEqual(environment, {**environment, **subset})

     def test_script_exit_non_zero(self):
         """

View File

@@ -25,7 +25,7 @@ class TestImporter(TestCase):
         cmd.manifest = [{"model": "documents.document"}]
         with self.assertRaises(CommandError) as cm:
             cmd._check_manifest()
-        self.assertTrue("The manifest file contains a record" in str(cm.exception))
+        self.assertIn("The manifest file contains a record", str(cm.exception))

         cmd.manifest = [
             {"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"},
@@ -33,6 +33,7 @@ class TestImporter(TestCase):
         # self.assertRaises(CommandError, cmd._check_manifest)
         with self.assertRaises(CommandError) as cm:
             cmd._check_manifest()
-        self.assertTrue(
-            'The manifest file refers to "noexist.pdf"' in str(cm.exception),
+        self.assertIn(
+            'The manifest file refers to "noexist.pdf"',
+            str(cm.exception),
         )

View File

@@ -1,6 +1,8 @@
 from tempfile import TemporaryDirectory
 from unittest import mock

+from django.apps import apps
+from django.test import override_settings
 from django.test import TestCase
 from documents.parsers import get_default_file_extension
 from documents.parsers import get_parser_class_for_mime_type
@@ -8,6 +10,7 @@ from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
 from paperless_tesseract.parsers import RasterisedDocumentParser
 from paperless_text.parsers import TextDocumentParser
+from paperless_tika.parsers import TikaDocumentParser


 class TestParserDiscovery(TestCase):
@@ -124,14 +127,43 @@ class TestParserDiscovery(TestCase):


 class TestParserAvailability(TestCase):
-    def test_file_extensions(self):
+    def test_tesseract_parser(self):
+        """
+        GIVEN:
+            - Various mime types
+        WHEN:
+            - The parser class is instantiated
+        THEN:
+            - The Tesseract based parser is returned
+        """
         supported_mimes_and_exts = [
             ("application/pdf", ".pdf"),
             ("image/png", ".png"),
             ("image/jpeg", ".jpg"),
             ("image/tiff", ".tif"),
             ("image/webp", ".webp"),
+        ]
+
+        supported_exts = get_supported_file_extensions()
+
+        for mime_type, ext in supported_mimes_and_exts:
+            self.assertIn(ext, supported_exts)
+            self.assertEqual(get_default_file_extension(mime_type), ext)
+            self.assertIsInstance(
+                get_parser_class_for_mime_type(mime_type)(logging_group=None),
+                RasterisedDocumentParser,
+            )
+
+    def test_text_parser(self):
+        """
+        GIVEN:
+            - Various mime types of a text form
+        WHEN:
+            - The parser class is instantiated
+        THEN:
+            - The text based parser is returned
+        """
+        supported_mimes_and_exts = [
             ("text/plain", ".txt"),
             ("text/csv", ".csv"),
         ]
@@ -141,23 +173,55 @@ class TestParserAvailability(TestCase):
         for mime_type, ext in supported_mimes_and_exts:
             self.assertIn(ext, supported_exts)
             self.assertEqual(get_default_file_extension(mime_type), ext)
+            self.assertIsInstance(
+                get_parser_class_for_mime_type(mime_type)(logging_group=None),
+                TextDocumentParser,
+            )
+
+    def test_tika_parser(self):
+        """
+        GIVEN:
+            - Various mime types of an office document form
+        WHEN:
+            - The parser class is instantiated
+        THEN:
+            - The Tika/Gotenberg based parser is returned
+        """
+        supported_mimes_and_exts = [
+            ("application/vnd.oasis.opendocument.text", ".odt"),
+            ("text/rtf", ".rtf"),
+            ("application/msword", ".doc"),
+            (
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                ".docx",
+            ),
+        ]
+
+        # Force the app ready to notice the settings override
+        with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
+            app = apps.get_app_config("paperless_tika")
+            app.ready()
+
+            supported_exts = get_supported_file_extensions()
+
+            for mime_type, ext in supported_mimes_and_exts:
+                self.assertIn(ext, supported_exts)
+                self.assertEqual(get_default_file_extension(mime_type), ext)
+                self.assertIsInstance(
+                    get_parser_class_for_mime_type(mime_type)(logging_group=None),
+                    TikaDocumentParser,
+                )
+
+    def test_no_parser_for_mime(self):
+        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
+
+    def test_default_extension(self):
         # Test no parser declared still returns an extension
         self.assertEqual(get_default_file_extension("application/zip"), ".zip")

         # Test invalid mimetype returns no extension
         self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")

-        self.assertIsInstance(
-            get_parser_class_for_mime_type("application/pdf")(logging_group=None),
-            RasterisedDocumentParser,
-        )
-        self.assertIsInstance(
-            get_parser_class_for_mime_type("text/plain")(logging_group=None),
-            TextDocumentParser,
-        )
-        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
+    def test_file_extension_support(self):
         self.assertTrue(is_file_ext_supported(".pdf"))
         self.assertFalse(is_file_ext_supported(".hsdfh"))
         self.assertFalse(is_file_ext_supported(""))

View File

@@ -109,6 +109,16 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]:


 def _parse_beat_schedule() -> Dict:
+    """
+    Configures the scheduled tasks, according to defaults or
+    environment variables.  Task expiration is configured so the task will
+    expire (and not run) shortly before the default frequency puts another
+    of the same task into the queue.
+
+    https://docs.celeryq.dev/en/stable/userguide/periodic-tasks.html#beat-entries
+    https://docs.celeryq.dev/en/latest/userguide/calling.html#expiration
+    """
     schedule = {}
     tasks = [
         {
@@ -117,6 +127,11 @@ def _parse_beat_schedule() -> Dict:
             # Default every ten minutes
             "env_default": "*/10 * * * *",
             "task": "paperless_mail.tasks.process_mail_accounts",
+            "options": {
+                # 1 minute before default schedule sends again
+                "expires": 9.0 * 60.0,
+            },
         },
         {
             "name": "Train the classifier",
@@ -124,6 +139,11 @@ def _parse_beat_schedule() -> Dict:
             # Default hourly at 5 minutes past the hour
             "env_default": "5 */1 * * *",
             "task": "documents.tasks.train_classifier",
+            "options": {
+                # 1 minute before default schedule sends again
+                "expires": 59.0 * 60.0,
+            },
         },
         {
             "name": "Optimize the index",
@@ -131,6 +151,12 @@ def _parse_beat_schedule() -> Dict:
             # Default daily at midnight
             "env_default": "0 0 * * *",
             "task": "documents.tasks.index_optimize",
+            "options": {
+                # 1 hour before default schedule sends again
+                "expires": 23.0 * 60.0 * 60.0,
+            },
         },
         {
             "name": "Perform sanity check",
@@ -138,6 +164,12 @@ def _parse_beat_schedule() -> Dict:
             # Default Sunday at 00:30
             "env_default": "30 0 * * sun",
             "task": "documents.tasks.sanity_check",
+            "options": {
+                # 1 hour before default schedule sends again
+                "expires": ((7.0 * 24.0) - 1.0) * 60.0 * 60.0,
+            },
         },
     ]

     for task in tasks:
@@ -151,9 +183,11 @@ def _parse_beat_schedule() -> Dict:
             # - five time-and-date fields
             # - separated by at least one blank
             minute, hour, day_month, month, day_week = value.split(" ")
+
             schedule[task["name"]] = {
                 "task": task["task"],
                 "schedule": crontab(minute, hour, day_week, day_month, month),
+                "options": task["options"],
             }

     return schedule
@@ -564,22 +598,21 @@ LOGGING = {
 # Task queue                                                                  #
 ###############################################################################

-TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", 1)
-
-WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)
+# https://docs.celeryq.dev/en/stable/userguide/configuration.html

 CELERY_BROKER_URL = _CELERY_REDIS_URL
 CELERY_TIMEZONE = TIME_ZONE

 CELERY_WORKER_HIJACK_ROOT_LOGGER = False
-CELERY_WORKER_CONCURRENCY = TASK_WORKERS
+CELERY_WORKER_CONCURRENCY: Final[int] = __get_int("PAPERLESS_TASK_WORKERS", 1)
+TASK_WORKERS = CELERY_WORKER_CONCURRENCY
 CELERY_WORKER_MAX_TASKS_PER_CHILD = 1
 CELERY_WORKER_SEND_TASK_EVENTS = True
-CELERY_TASK_SEND_SENT_EVENT = True
 CELERY_SEND_TASK_SENT_EVENT = True

 CELERY_TASK_TRACK_STARTED = True
-CELERY_TASK_TIME_LIMIT = WORKER_TIMEOUT
+CELERY_TASK_TIME_LIMIT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)

 CELERY_RESULT_EXTENDED = True
 CELERY_RESULT_BACKEND = "django-db"
@@ -611,7 +644,7 @@ def default_threads_per_worker(task_workers) -> int:

 THREADS_PER_WORKER = os.getenv(
     "PAPERLESS_THREADS_PER_WORKER",
-    default_threads_per_worker(TASK_WORKERS),
+    default_threads_per_worker(CELERY_WORKER_CONCURRENCY),
 )

 ###############################################################################
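
Note: with these changes each generated beat entry carries an expires option slightly shorter than its send interval, so a task that lingers in the queue past its usefulness is discarded instead of piling up behind the next one. The resulting shape for the mail task, with the values from this diff:

    from celery.schedules import crontab

    schedule = {
        "Check all e-mail accounts": {
            "task": "paperless_mail.tasks.process_mail_accounts",
            "schedule": crontab(minute="*/10"),
            # expires one minute before the next send would queue a duplicate
            "options": {"expires": 9.0 * 60.0},
        },
    }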

View File

@@ -149,6 +149,11 @@ class TestRedisSocketConversion(TestCase):


 class TestCeleryScheduleParsing(TestCase):
+    MAIL_EXPIRE_TIME = 9.0 * 60.0
+    CLASSIFIER_EXPIRE_TIME = 59.0 * 60.0
+    INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0
+    SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0
+
     def test_schedule_configuration_default(self):
         """
         GIVEN:
@@ -165,18 +170,22 @@ class TestCeleryScheduleParsing(TestCase):
                 "Check all e-mail accounts": {
                     "task": "paperless_mail.tasks.process_mail_accounts",
                     "schedule": crontab(minute="*/10"),
+                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                 },
                 "Train the classifier": {
                     "task": "documents.tasks.train_classifier",
                     "schedule": crontab(minute="5", hour="*/1"),
+                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                 },
                 "Optimize the index": {
                     "task": "documents.tasks.index_optimize",
                     "schedule": crontab(minute=0, hour=0),
+                    "options": {"expires": self.INDEX_EXPIRE_TIME},
                 },
                 "Perform sanity check": {
                     "task": "documents.tasks.sanity_check",
                     "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                 },
             },
             schedule,
@@ -203,18 +212,22 @@ class TestCeleryScheduleParsing(TestCase):
                 "Check all e-mail accounts": {
                     "task": "paperless_mail.tasks.process_mail_accounts",
                     "schedule": crontab(minute="*/50", day_of_week="mon"),
+                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                 },
                 "Train the classifier": {
                     "task": "documents.tasks.train_classifier",
                     "schedule": crontab(minute="5", hour="*/1"),
+                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                 },
                 "Optimize the index": {
                     "task": "documents.tasks.index_optimize",
                     "schedule": crontab(minute=0, hour=0),
+                    "options": {"expires": self.INDEX_EXPIRE_TIME},
                 },
                 "Perform sanity check": {
                     "task": "documents.tasks.sanity_check",
                     "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                 },
             },
             schedule,
@@ -238,14 +251,17 @@ class TestCeleryScheduleParsing(TestCase):
                 "Check all e-mail accounts": {
                     "task": "paperless_mail.tasks.process_mail_accounts",
                     "schedule": crontab(minute="*/10"),
+                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                 },
                 "Train the classifier": {
                     "task": "documents.tasks.train_classifier",
                     "schedule": crontab(minute="5", hour="*/1"),
+                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                 },
                 "Perform sanity check": {
                     "task": "documents.tasks.sanity_check",
                     "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                 },
             },
             schedule,

View File

@@ -14,15 +14,14 @@ TEST_CHANNEL_LAYERS = {
 }


+@override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
 class TestWebSockets(TestCase):
-    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
     async def test_no_auth(self):
         communicator = WebsocketCommunicator(application, "/ws/status/")
         connected, subprotocol = await communicator.connect()
         self.assertFalse(connected)
         await communicator.disconnect()

-    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
     @mock.patch("paperless.consumers.StatusConsumer._authenticated")
     async def test_auth(self, _authenticated):
         _authenticated.return_value = True
@@ -33,7 +32,6 @@ class TestWebSockets(TestCase):

         await communicator.disconnect()

-    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
     @mock.patch("paperless.consumers.StatusConsumer._authenticated")
     async def test_receive(self, _authenticated):
         _authenticated.return_value = True

View File

@@ -24,7 +24,7 @@ class StandardPagination(PageNumberPagination):


 class FaviconView(View):
-    def get(self, request, *args, **kwargs):
+    def get(self, request, *args, **kwargs):  # pragma: nocover
         favicon = os.path.join(
             os.path.dirname(__file__),
             "static",

View File

@@ -2,12 +2,13 @@ from django.contrib.auth.models import User
 from documents.models import Correspondent
 from documents.models import DocumentType
 from documents.models import Tag
+from documents.tests.utils import DirectoriesMixin
 from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule
 from rest_framework.test import APITestCase


-class TestAPIMailAccounts(APITestCase):
+class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
     ENDPOINT = "/api/mail_accounts/"

     def setUp(self):
@@ -165,7 +166,7 @@ class TestAPIMailAccounts(APITestCase):
         self.assertEqual(returned_account2.password, "123xyz")


-class TestAPIMailRules(APITestCase):
+class TestAPIMailRules(DirectoriesMixin, APITestCase):
     ENDPOINT = "/api/mail_rules/"

     def setUp(self):
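
Note: mixing in DirectoriesMixin keeps these API tests from touching the real data directories. The mixin itself lives in documents.tests.utils and is not part of this diff; a rough sketch of what such a mixin does (assumed, not the actual implementation):

    import shutil
    import tempfile

    from django.test import override_settings

    class DirectoriesMixinSketch:
        def setUp(self):
            super().setUp()
            self._data_dir = tempfile.mkdtemp()
            self._overrides = override_settings(DATA_DIR=self._data_dir)
            self._overrides.enable()

        def tearDown(self):
            super().tearDown()
            self._overrides.disable()
            shutil.rmtree(self._data_dir, ignore_errors=True)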

View File

@@ -161,7 +161,7 @@ class RasterisedDocumentParser(DocumentParser):
         except Exception:
             # TODO catch all for various issues with PDFminer.six.
-            # If PDFminer fails, fall back to OCR.
+            # If pdftotext fails, fall back to OCR.
             self.log(
                 "warning",
                 "Error while getting text from PDF document with " "pdfminer.six",

View File

@@ -364,7 +364,7 @@ class TestParser(DirectoriesMixin, TestCase):
         )
         self.assertTrue(os.path.isfile(parser.archive_path))
         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
-        self.assertFalse("page 3" in parser.get_text().lower())
+        self.assertNotIn("page 3", parser.get_text().lower())

     @override_settings(OCR_PAGES=1, OCR_MODE="force")
     def test_multi_page_analog_pages_force(self):
@@ -386,8 +386,8 @@ class TestParser(DirectoriesMixin, TestCase):
         )
         self.assertTrue(os.path.isfile(parser.archive_path))
         self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
-        self.assertFalse("page 2" in parser.get_text().lower())
-        self.assertFalse("page 3" in parser.get_text().lower())
+        self.assertNotIn("page 2", parser.get_text().lower())
+        self.assertNotIn("page 3", parser.get_text().lower())

     @override_settings(OCR_MODE="skip_noarchive")
     def test_skip_noarchive_withtext(self):
@@ -660,6 +660,15 @@ class TestParser(DirectoriesMixin, TestCase):
         params = parser.construct_ocrmypdf_parameters("", "", "", "")
         self.assertNotIn("deskew", params)

+        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertIn("max_image_mpixels", params)
+            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
+
+        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertNotIn("max_image_mpixels", params)
+
     def test_rtl_language_detection(self):
         """
         GIVEN:
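
Note: the new assertions above pin down a unit conversion: PAPERLESS_OCR_MAX_IMAGE_PIXELS is given in raw pixels, while ocrmypdf's max_image_mpixels option takes megapixels, and non-positive values disable the limit entirely. A simplified sketch of the logic under test (inferred from the assertions, not copied from the parser):

    from typing import Optional

    def build_max_pixels_param(ocr_max_image_pixels: Optional[float]) -> dict:
        # Positive pixel counts are converted to megapixels; anything
        # else leaves the ocrmypdf default untouched.
        params = {}
        if ocr_max_image_pixels is not None and ocr_max_image_pixels > 0:
            params["max_image_mpixels"] = ocr_max_image_pixels / 1_000_000
        return params

    assert abs(build_max_pixels_param(1_000_001.0)["max_image_mpixels"] - 1) < 1e-4
    assert "max_image_mpixels" not in build_max_pixels_param(-1_000_001.0)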

View File

@@ -3,7 +3,9 @@ import os
 from pathlib import Path
 from unittest import mock

+from django.test import override_settings
 from django.test import TestCase
+from documents.parsers import ParseError
 from paperless_tika.parsers import TikaDocumentParser
 from requests import Response

@@ -54,3 +56,63 @@ class TestTikaParser(TestCase):
         self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
         self.assertTrue("Some-key" in [m["key"] for m in metadata])
+
+    @mock.patch("paperless_tika.parsers.parser.from_file")
+    @mock.patch("paperless_tika.parsers.requests.post")
+    def test_convert_failure(self, post, from_file):
+        """
+        GIVEN:
+            - Document needs to be converted to PDF
+        WHEN:
+            - Gotenberg server returns an error
+        THEN:
+            - Parse error is raised
+        """
+        from_file.return_value = {
+            "content": "the content",
+            "metadata": {"Creation-Date": "2020-11-21"},
+        }
+        response = Response()
+        response._content = b"PDF document"
+        response.status_code = 500
+        post.return_value = response
+
+        file = os.path.join(self.parser.tempdir, "input.odt")
+        Path(file).touch()
+
+        with self.assertRaises(ParseError):
+            self.parser.convert_to_pdf(file, None)
+
+    @mock.patch("paperless_tika.parsers.requests.post")
+    def test_request_pdf_a_format(self, post: mock.Mock):
+        """
+        GIVEN:
+            - Document needs to be converted to PDF
+        WHEN:
+            - Specific PDF/A format requested
+        THEN:
+            - Request to Gotenberg contains the expected PDF/A format string
+        """
+        file = os.path.join(self.parser.tempdir, "input.odt")
+        Path(file).touch()
+
+        response = Response()
+        response._content = b"PDF document"
+        response.status_code = 200
+        post.return_value = response
+
+        for setting, expected_key in [
+            ("pdfa", "PDF/A-2b"),
+            ("pdfa-2", "PDF/A-2b"),
+            ("pdfa-1", "PDF/A-1a"),
+            ("pdfa-3", "PDF/A-3b"),
+        ]:
+            with override_settings(OCR_OUTPUT_TYPE=setting):
+                self.parser.convert_to_pdf(file, None)
+
+                post.assert_called_once()
+                _, kwargs = post.call_args
+
+                self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)
+
+                post.reset_mock()
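
Note: the parameter matrix doubles as documentation of how each PAPERLESS_OCR_OUTPUT_TYPE value maps to a Gotenberg pdfFormat string. Collected in one place (values exactly as asserted above):

    # OCR_OUTPUT_TYPE -> Gotenberg pdfFormat, per the test matrix above
    PDF_A_FORMAT_FOR_OUTPUT_TYPE = {
        "pdfa": "PDF/A-2b",  # unversioned request defaults to PDF/A-2b
        "pdfa-1": "PDF/A-1a",
        "pdfa-2": "PDF/A-2b",
        "pdfa-3": "PDF/A-3b",
    }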