From bdcba570cb3c70391b4e05aac656864d2e8a809b Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 2 Feb 2023 12:46:49 -0800 Subject: [PATCH 01/16] Adding more test coverage, in particular around Tika and its parser --- src/documents/tests/test_api.py | 14 ++-- src/documents/tests/test_importer.py | 7 +- src/documents/tests/test_parsers.py | 88 +++++++++++++++++--- src/paperless/tests/test_websockets.py | 4 +- src/paperless/views.py | 2 +- src/paperless_tesseract/parsers.py | 2 +- src/paperless_tesseract/tests/test_parser.py | 15 +++- src/paperless_tika/tests/test_tika_parser.py | 62 ++++++++++++++ 8 files changed, 164 insertions(+), 30 deletions(-) diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index b6d817de1..1b8a71ded 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -121,28 +121,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): response = self.client.get("/api/documents/", format="json") self.assertEqual(response.status_code, 200) results_full = response.data["results"] - self.assertTrue("content" in results_full[0]) - self.assertTrue("id" in results_full[0]) + self.assertIn("content", results_full[0]) + self.assertIn("id", results_full[0]) response = self.client.get("/api/documents/?fields=id", format="json") self.assertEqual(response.status_code, 200) results = response.data["results"] self.assertFalse("content" in results[0]) - self.assertTrue("id" in results[0]) + self.assertIn("id", results[0]) self.assertEqual(len(results[0]), 1) response = self.client.get("/api/documents/?fields=content", format="json") self.assertEqual(response.status_code, 200) results = response.data["results"] - self.assertTrue("content" in results[0]) + self.assertIn("content", results[0]) self.assertFalse("id" in results[0]) self.assertEqual(len(results[0]), 1) response = self.client.get("/api/documents/?fields=id,content", format="json") self.assertEqual(response.status_code, 200) results = response.data["results"] - self.assertTrue("content" in results[0]) - self.assertTrue("id" in results[0]) + self.assertIn("content", results[0]) + self.assertIn("id", results[0]) self.assertEqual(len(results[0]), 2) response = self.client.get( @@ -152,7 +152,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): self.assertEqual(response.status_code, 200) results = response.data["results"] self.assertFalse("content" in results[0]) - self.assertTrue("id" in results[0]) + self.assertIn("id", results[0]) self.assertEqual(len(results[0]), 1) response = self.client.get("/api/documents/?fields=", format="json") diff --git a/src/documents/tests/test_importer.py b/src/documents/tests/test_importer.py index 5101a269f..10146ff30 100644 --- a/src/documents/tests/test_importer.py +++ b/src/documents/tests/test_importer.py @@ -25,7 +25,7 @@ class TestImporter(TestCase): cmd.manifest = [{"model": "documents.document"}] with self.assertRaises(CommandError) as cm: cmd._check_manifest() - self.assertTrue("The manifest file contains a record" in str(cm.exception)) + self.assertIn("The manifest file contains a record", str(cm.exception)) cmd.manifest = [ {"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"}, @@ -33,6 +33,7 @@ class TestImporter(TestCase): # self.assertRaises(CommandError, cmd._check_manifest) with self.assertRaises(CommandError) as cm: cmd._check_manifest() - self.assertTrue( - 'The manifest file refers to "noexist.pdf"' in str(cm.exception), + self.assertIn( + 'The manifest file refers to "noexist.pdf"', + str(cm.exception), ) diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 8ba2c70ee..eda4bacf8 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -1,6 +1,8 @@ from tempfile import TemporaryDirectory from unittest import mock +from django.apps import apps +from django.test import override_settings from django.test import TestCase from documents.parsers import get_default_file_extension from documents.parsers import get_parser_class_for_mime_type @@ -8,6 +10,7 @@ from documents.parsers import get_supported_file_extensions from documents.parsers import is_file_ext_supported from paperless_tesseract.parsers import RasterisedDocumentParser from paperless_text.parsers import TextDocumentParser +from paperless_tika.parsers import TikaDocumentParser class TestParserDiscovery(TestCase): @@ -124,14 +127,43 @@ class TestParserDiscovery(TestCase): class TestParserAvailability(TestCase): - def test_file_extensions(self): - + def test_tesseract_parser(self): + """ + GIVEN: + - Various mime types + WHEN: + - The parser class is instantiated + THEN: + - The Tesseract based parser is return + """ supported_mimes_and_exts = [ ("application/pdf", ".pdf"), ("image/png", ".png"), ("image/jpeg", ".jpg"), ("image/tiff", ".tif"), ("image/webp", ".webp"), + ] + + supported_exts = get_supported_file_extensions() + + for mime_type, ext in supported_mimes_and_exts: + self.assertIn(ext, supported_exts) + self.assertEqual(get_default_file_extension(mime_type), ext) + self.assertIsInstance( + get_parser_class_for_mime_type(mime_type)(logging_group=None), + RasterisedDocumentParser, + ) + + def test_text_parser(self): + """ + GIVEN: + - Various mime types of a text form + WHEN: + - The parser class is instantiated + THEN: + - The text based parser is return + """ + supported_mimes_and_exts = [ ("text/plain", ".txt"), ("text/csv", ".csv"), ] @@ -141,23 +173,55 @@ class TestParserAvailability(TestCase): for mime_type, ext in supported_mimes_and_exts: self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) + self.assertIsInstance( + get_parser_class_for_mime_type(mime_type)(logging_group=None), + TextDocumentParser, + ) + def test_tika_parser(self): + """ + GIVEN: + - Various mime types of a office document form + WHEN: + - The parser class is instantiated + THEN: + - The Tika/Gotenberg based parser is return + """ + supported_mimes_and_exts = [ + ("application/vnd.oasis.opendocument.text", ".odt"), + ("text/rtf", ".rtf"), + ("application/msword", ".doc"), + ( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".docx", + ), + ] + + # Force the app ready to notice the settings override + with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]): + app = apps.get_app_config("paperless_tika") + app.ready() + supported_exts = get_supported_file_extensions() + + for mime_type, ext in supported_mimes_and_exts: + self.assertIn(ext, supported_exts) + self.assertEqual(get_default_file_extension(mime_type), ext) + self.assertIsInstance( + get_parser_class_for_mime_type(mime_type)(logging_group=None), + TikaDocumentParser, + ) + + def test_no_parser_for_mime(self): + self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf")) + + def test_default_extension(self): # Test no parser declared still returns a an extension self.assertEqual(get_default_file_extension("application/zip"), ".zip") # Test invalid mimetype returns no extension self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "") - self.assertIsInstance( - get_parser_class_for_mime_type("application/pdf")(logging_group=None), - RasterisedDocumentParser, - ) - self.assertIsInstance( - get_parser_class_for_mime_type("text/plain")(logging_group=None), - TextDocumentParser, - ) - self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf")) - + def test_file_extension_support(self): self.assertTrue(is_file_ext_supported(".pdf")) self.assertFalse(is_file_ext_supported(".hsdfh")) self.assertFalse(is_file_ext_supported("")) diff --git a/src/paperless/tests/test_websockets.py b/src/paperless/tests/test_websockets.py index 069bb644a..cebbddf39 100644 --- a/src/paperless/tests/test_websockets.py +++ b/src/paperless/tests/test_websockets.py @@ -14,15 +14,14 @@ TEST_CHANNEL_LAYERS = { } +@override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS) class TestWebSockets(TestCase): - @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS) async def test_no_auth(self): communicator = WebsocketCommunicator(application, "/ws/status/") connected, subprotocol = await communicator.connect() self.assertFalse(connected) await communicator.disconnect() - @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS) @mock.patch("paperless.consumers.StatusConsumer._authenticated") async def test_auth(self, _authenticated): _authenticated.return_value = True @@ -33,7 +32,6 @@ class TestWebSockets(TestCase): await communicator.disconnect() - @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS) @mock.patch("paperless.consumers.StatusConsumer._authenticated") async def test_receive(self, _authenticated): _authenticated.return_value = True diff --git a/src/paperless/views.py b/src/paperless/views.py index 9f3d017a6..975df6601 100644 --- a/src/paperless/views.py +++ b/src/paperless/views.py @@ -12,7 +12,7 @@ class StandardPagination(PageNumberPagination): class FaviconView(View): - def get(self, request, *args, **kwargs): + def get(self, request, *args, **kwargs): # pragma: nocover favicon = os.path.join( os.path.dirname(__file__), "static", diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 14068cb26..4227583f8 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -161,7 +161,7 @@ class RasterisedDocumentParser(DocumentParser): except Exception: # TODO catch all for various issues with PDFminer.six. - # If PDFminer fails, fall back to OCR. + # If pdftotext fails, fall back to OCR. self.log( "warning", "Error while getting text from PDF document with " "pdfminer.six", diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 7fa399c97..d22ce26a7 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -364,7 +364,7 @@ class TestParser(DirectoriesMixin, TestCase): ) self.assertTrue(os.path.isfile(parser.archive_path)) self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) - self.assertFalse("page 3" in parser.get_text().lower()) + self.assertNotIn("page 3", parser.get_text().lower()) @override_settings(OCR_PAGES=1, OCR_MODE="force") def test_multi_page_analog_pages_force(self): @@ -386,8 +386,8 @@ class TestParser(DirectoriesMixin, TestCase): ) self.assertTrue(os.path.isfile(parser.archive_path)) self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) - self.assertFalse("page 2" in parser.get_text().lower()) - self.assertFalse("page 3" in parser.get_text().lower()) + self.assertNotIn("page 2", parser.get_text().lower()) + self.assertNotIn("page 3", parser.get_text().lower()) @override_settings(OCR_MODE="skip_noarchive") def test_skip_noarchive_withtext(self): @@ -660,6 +660,15 @@ class TestParser(DirectoriesMixin, TestCase): params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertNotIn("deskew", params) + with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0): + params = parser.construct_ocrmypdf_parameters("", "", "", "") + self.assertIn("max_image_mpixels", params) + self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4) + + with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0): + params = parser.construct_ocrmypdf_parameters("", "", "", "") + self.assertNotIn("max_image_mpixels", params) + def test_rtl_language_detection(self): """ GIVEN: diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py index bf6b4e7c8..058196581 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless_tika/tests/test_tika_parser.py @@ -3,7 +3,9 @@ import os from pathlib import Path from unittest import mock +from django.test import override_settings from django.test import TestCase +from documents.parsers import ParseError from paperless_tika.parsers import TikaDocumentParser from requests import Response @@ -54,3 +56,63 @@ class TestTikaParser(TestCase): self.assertTrue("Creation-Date" in [m["key"] for m in metadata]) self.assertTrue("Some-key" in [m["key"] for m in metadata]) + + @mock.patch("paperless_tika.parsers.parser.from_file") + @mock.patch("paperless_tika.parsers.requests.post") + def test_convert_failure(self, post, from_file): + """ + GIVEN: + - Document needs to be converted to PDF + WHEN: + - Gotenberg server returns an error + THEN: + - Parse error is raised + """ + from_file.return_value = { + "content": "the content", + "metadata": {"Creation-Date": "2020-11-21"}, + } + response = Response() + response._content = b"PDF document" + response.status_code = 500 + post.return_value = response + + file = os.path.join(self.parser.tempdir, "input.odt") + Path(file).touch() + + with self.assertRaises(ParseError): + self.parser.convert_to_pdf(file, None) + + @mock.patch("paperless_tika.parsers.requests.post") + def test_request_pdf_a_format(self, post: mock.Mock): + """ + GIVEN: + - Document needs to be converted to PDF + WHEN: + - Specific PDF/A format requested + THEN: + - Request to Gotenberg contains the expected PDF/A format string + """ + file = os.path.join(self.parser.tempdir, "input.odt") + Path(file).touch() + + response = Response() + response._content = b"PDF document" + response.status_code = 200 + post.return_value = response + + for setting, expected_key in [ + ("pdfa", "PDF/A-2b"), + ("pdfa-2", "PDF/A-2b"), + ("pdfa-1", "PDF/A-1a"), + ("pdfa-3", "PDF/A-3b"), + ]: + with override_settings(OCR_OUTPUT_TYPE=setting): + self.parser.convert_to_pdf(file, None) + + post.assert_called_once() + _, kwargs = post.call_args + + self.assertEqual(kwargs["data"]["pdfFormat"], expected_key) + + post.reset_mock() From 8ef5f0e93c02f463cd7cd2d1a138531401a536d9 Mon Sep 17 00:00:00 2001 From: Bastien KERVICHE Date: Mon, 6 Feb 2023 11:21:10 +0100 Subject: [PATCH 02/16] fix: limit pagination in management-list --- .../manage/management-list/management-list.component.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src-ui/src/app/components/manage/management-list/management-list.component.html b/src-ui/src/app/components/manage/management-list/management-list.component.html index d35bc0853..49baf8aaf 100644 --- a/src-ui/src/app/components/manage/management-list/management-list.component.html +++ b/src-ui/src/app/components/manage/management-list/management-list.component.html @@ -10,7 +10,7 @@ - + @@ -72,5 +72,5 @@
{collectionSize, plural, =1 {One {{typeName}}} other {{{collectionSize || 0}} total {{typeNamePlural}}}}
- +
From 66333caebc13f79a44ec73f535df83847d3242fd Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 6 Feb 2023 09:51:14 -0800 Subject: [PATCH 03/16] Fixes all Python versions uploading to Codecov --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74229ef60..f8da8c7b8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -161,7 +161,7 @@ jobs: pipenv --python ${{ steps.setup-python.outputs.python-version }} run pytest -ra - name: Upload coverage to Codecov - if: matrix.python-version == ${{ env.DEFAULT_PYTHON_VERSION }} + if: ${{ matrix.python-version == env.DEFAULT_PYTHON_VERSION }} uses: codecov/codecov-action@v3 with: # not required for public repos, but intermittently fails otherwise From 74b729bf5a08c0a2caf1d5d343a636a09a55b4b4 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue, 7 Feb 2023 15:44:15 -0800 Subject: [PATCH 04/16] tags dropdown doesnt need to open upon removal --- .../app/components/common/input/tags/tags.component.html | 2 +- .../src/app/components/common/input/tags/tags.component.ts | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src-ui/src/app/components/common/input/tags/tags.component.html b/src-ui/src/app/components/common/input/tags/tags.component.html index 14de0f98a..4cd8a6132 100644 --- a/src-ui/src/app/components/common/input/tags/tags.component.html +++ b/src-ui/src/app/components/common/input/tags/tags.component.html @@ -17,7 +17,7 @@ (blur)="onBlur()"> - + diff --git a/src-ui/src/app/components/common/input/tags/tags.component.ts b/src-ui/src/app/components/common/input/tags/tags.component.ts index 0a4ed6fb2..6c5ea887f 100644 --- a/src-ui/src/app/components/common/input/tags/tags.component.ts +++ b/src-ui/src/app/components/common/input/tags/tags.component.ts @@ -65,7 +65,7 @@ export class TagsComponent implements OnInit, ControlValueAccessor { private _lastSearchTerm: string - getTag(id) { + getTag(id: number) { if (this.tags) { return this.tags.find((tag) => tag.id == id) } else { @@ -73,7 +73,10 @@ export class TagsComponent implements OnInit, ControlValueAccessor { } } - removeTag(id) { + removeTag(event: PointerEvent, id: number) { + // prevent opening dropdown + event.stopImmediatePropagation() + let index = this.value.indexOf(id) if (index > -1) { let oldValue = this.value From b9f0418038c8a10bf3fc63af4c4351a682207ae7 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 3 Feb 2023 14:19:22 -0800 Subject: [PATCH 05/16] Fixes flower not respecting its config location, and a little more info to the user --- docker/flower-conditional.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/flower-conditional.sh b/docker/flower-conditional.sh index 04319a8e3..f8719e0fd 100644 --- a/docker/flower-conditional.sh +++ b/docker/flower-conditional.sh @@ -3,5 +3,10 @@ echo "Checking if we should start flower..." if [[ -n "${PAPERLESS_ENABLE_FLOWER}" ]]; then - celery --app paperless flower + # Small delay to allow celery to be up first + echo "Starting flower in 5s" + sleep 5 + celery --app paperless flower --conf=/usr/src/paperless/src/paperless/flowerconfig.py +else + echo "Not starting flower" fi From 7af0b47ba97ecdefb583af735e4b0e0caeeed57d Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 3 Feb 2023 14:41:10 -0800 Subject: [PATCH 06/16] Expire the scheduled tasks shortly a new one will be added to the queue by default --- src/paperless/settings.py | 48 ++++++++++++++++++++++++---- src/paperless/tests/test_settings.py | 16 ++++++++++ 2 files changed, 57 insertions(+), 7 deletions(-) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index cf119ea8a..fcd933cfd 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -109,6 +109,16 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]: def _parse_beat_schedule() -> Dict: + """ + Configures the scheduled tasks, according to default or + environment variables. Task expiration is configured so the task will + expire (and not run), shortly before the default frequency will put another + of the same task into the queue + + + https://docs.celeryq.dev/en/stable/userguide/periodic-tasks.html#beat-entries + https://docs.celeryq.dev/en/latest/userguide/calling.html#expiration + """ schedule = {} tasks = [ { @@ -117,6 +127,11 @@ def _parse_beat_schedule() -> Dict: # Default every ten minutes "env_default": "*/10 * * * *", "task": "paperless_mail.tasks.process_mail_accounts", + "options": { + # 1 minute before default schedule sends again + "expires": 9.0 + * 60.0, + }, }, { "name": "Train the classifier", @@ -124,6 +139,11 @@ def _parse_beat_schedule() -> Dict: # Default hourly at 5 minutes past the hour "env_default": "5 */1 * * *", "task": "documents.tasks.train_classifier", + "options": { + # 1 minute before default schedule sends again + "expires": 59.0 + * 60.0, + }, }, { "name": "Optimize the index", @@ -131,6 +151,12 @@ def _parse_beat_schedule() -> Dict: # Default daily at midnight "env_default": "0 0 * * *", "task": "documents.tasks.index_optimize", + "options": { + # 1 hour before default schedule sends again + "expires": 23.0 + * 60.0 + * 60.0, + }, }, { "name": "Perform sanity check", @@ -138,6 +164,13 @@ def _parse_beat_schedule() -> Dict: # Default Sunday at 00:30 "env_default": "30 0 * * sun", "task": "documents.tasks.sanity_check", + "options": { + # 1 hour before default schedule sends again + "expires": 7.0 + * 23.0 + * 60.0 + * 60.0, + }, }, ] for task in tasks: @@ -151,9 +184,11 @@ def _parse_beat_schedule() -> Dict: # - five time-and-date fields # - separated by at least one blank minute, hour, day_month, month, day_week = value.split(" ") + schedule[task["name"]] = { "task": task["task"], "schedule": crontab(minute, hour, day_week, day_month, month), + "options": task["options"], } return schedule @@ -561,22 +596,21 @@ LOGGING = { # Task queue # ############################################################################### -TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", 1) - -WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800) +# https://docs.celeryq.dev/en/stable/userguide/configuration.html CELERY_BROKER_URL = _CELERY_REDIS_URL CELERY_TIMEZONE = TIME_ZONE CELERY_WORKER_HIJACK_ROOT_LOGGER = False -CELERY_WORKER_CONCURRENCY = TASK_WORKERS +CELERY_WORKER_CONCURRENCY: Final[int] = __get_int("PAPERLESS_TASK_WORKERS", 1) +TASK_WORKERS = CELERY_WORKER_CONCURRENCY CELERY_WORKER_MAX_TASKS_PER_CHILD = 1 CELERY_WORKER_SEND_TASK_EVENTS = True - +CELERY_TASK_SEND_SENT_EVENT = True CELERY_SEND_TASK_SENT_EVENT = True CELERY_TASK_TRACK_STARTED = True -CELERY_TASK_TIME_LIMIT = WORKER_TIMEOUT +CELERY_TASK_TIME_LIMIT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800) CELERY_RESULT_EXTENDED = True CELERY_RESULT_BACKEND = "django-db" @@ -608,7 +642,7 @@ def default_threads_per_worker(task_workers) -> int: THREADS_PER_WORKER = os.getenv( "PAPERLESS_THREADS_PER_WORKER", - default_threads_per_worker(TASK_WORKERS), + default_threads_per_worker(CELERY_WORKER_CONCURRENCY), ) ############################################################################### diff --git a/src/paperless/tests/test_settings.py b/src/paperless/tests/test_settings.py index f6d25f6fd..a85f0e06a 100644 --- a/src/paperless/tests/test_settings.py +++ b/src/paperless/tests/test_settings.py @@ -149,6 +149,11 @@ class TestRedisSocketConversion(TestCase): class TestCeleryScheduleParsing(TestCase): + MAIL_EXPIRE_TIME = 9.0 * 60.0 + CLASSIFIER_EXPIRE_TIME = 59.0 * 60.0 + INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0 + SANITY_EXPIRE_TIME = 7.0 * 23.0 * 60.0 * 60.0 + def test_schedule_configuration_default(self): """ GIVEN: @@ -165,18 +170,22 @@ class TestCeleryScheduleParsing(TestCase): "Check all e-mail accounts": { "task": "paperless_mail.tasks.process_mail_accounts", "schedule": crontab(minute="*/10"), + "options": {"expires": self.MAIL_EXPIRE_TIME}, }, "Train the classifier": { "task": "documents.tasks.train_classifier", "schedule": crontab(minute="5", hour="*/1"), + "options": {"expires": self.CLASSIFIER_EXPIRE_TIME}, }, "Optimize the index": { "task": "documents.tasks.index_optimize", "schedule": crontab(minute=0, hour=0), + "options": {"expires": self.INDEX_EXPIRE_TIME}, }, "Perform sanity check": { "task": "documents.tasks.sanity_check", "schedule": crontab(minute=30, hour=0, day_of_week="sun"), + "options": {"expires": self.SANITY_EXPIRE_TIME}, }, }, schedule, @@ -203,18 +212,22 @@ class TestCeleryScheduleParsing(TestCase): "Check all e-mail accounts": { "task": "paperless_mail.tasks.process_mail_accounts", "schedule": crontab(minute="*/50", day_of_week="mon"), + "options": {"expires": self.MAIL_EXPIRE_TIME}, }, "Train the classifier": { "task": "documents.tasks.train_classifier", "schedule": crontab(minute="5", hour="*/1"), + "options": {"expires": self.CLASSIFIER_EXPIRE_TIME}, }, "Optimize the index": { "task": "documents.tasks.index_optimize", "schedule": crontab(minute=0, hour=0), + "options": {"expires": self.INDEX_EXPIRE_TIME}, }, "Perform sanity check": { "task": "documents.tasks.sanity_check", "schedule": crontab(minute=30, hour=0, day_of_week="sun"), + "options": {"expires": self.SANITY_EXPIRE_TIME}, }, }, schedule, @@ -238,14 +251,17 @@ class TestCeleryScheduleParsing(TestCase): "Check all e-mail accounts": { "task": "paperless_mail.tasks.process_mail_accounts", "schedule": crontab(minute="*/10"), + "options": {"expires": self.MAIL_EXPIRE_TIME}, }, "Train the classifier": { "task": "documents.tasks.train_classifier", "schedule": crontab(minute="5", hour="*/1"), + "options": {"expires": self.CLASSIFIER_EXPIRE_TIME}, }, "Perform sanity check": { "task": "documents.tasks.sanity_check", "schedule": crontab(minute=30, hour=0, day_of_week="sun"), + "options": {"expires": self.SANITY_EXPIRE_TIME}, }, }, schedule, From 8181535f40591063eb0d3d2e56f6f9268e701012 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 8 Feb 2023 09:33:46 -0800 Subject: [PATCH 07/16] Fixes expiration of the weekly task --- src/paperless/settings.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index fcd933cfd..fa0f76a91 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -166,8 +166,7 @@ def _parse_beat_schedule() -> Dict: "task": "documents.tasks.sanity_check", "options": { # 1 hour before default schedule sends again - "expires": 7.0 - * 23.0 + "expires": ((7.0 * 24.0) - 1.0) * 60.0 * 60.0, }, From 40db244d4a84d1bde4d4c78bc4cbdef5303b77d3 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 8 Feb 2023 09:47:56 -0800 Subject: [PATCH 08/16] Fixes the test for expiration --- src/paperless/tests/test_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paperless/tests/test_settings.py b/src/paperless/tests/test_settings.py index a85f0e06a..44ca46ec4 100644 --- a/src/paperless/tests/test_settings.py +++ b/src/paperless/tests/test_settings.py @@ -152,7 +152,7 @@ class TestCeleryScheduleParsing(TestCase): MAIL_EXPIRE_TIME = 9.0 * 60.0 CLASSIFIER_EXPIRE_TIME = 59.0 * 60.0 INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0 - SANITY_EXPIRE_TIME = 7.0 * 23.0 * 60.0 * 60.0 + SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0 def test_schedule_configuration_default(self): """ From d73fbb1643bbe3f205f2d07597333ae3f21e4699 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Thu, 9 Feb 2023 20:16:45 -0800 Subject: [PATCH 09/16] Fix long dropdown contents break column layout --- .../components/document-detail/document-detail.component.html | 2 +- .../components/document-detail/document-detail.component.scss | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index b79e56ca0..d824f6921 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -63,7 +63,7 @@
-
+
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.scss b/src-ui/src/app/components/document-detail/document-detail.component.scss index 71d50ca61..76b834bc9 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.scss +++ b/src-ui/src/app/components/document-detail/document-detail.component.scss @@ -22,6 +22,10 @@ --page-margin: 1px 0 20px; } +::ng-deep .ng-select-taggable { + max-width: calc(100% - 46px); // fudge factor for ng-select button width +} + .password-prompt { position: absolute; top: 30%; From ced248ad49e8006a36ef2b8072fee5d921fa5323 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Thu, 9 Feb 2023 20:17:00 -0800 Subject: [PATCH 10/16] Fix toggle split doesnt have rounded right corner --- .../document-detail/document-detail.component.scss | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src-ui/src/app/components/document-detail/document-detail.component.scss b/src-ui/src/app/components/document-detail/document-detail.component.scss index 76b834bc9..23109950c 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.scss +++ b/src-ui/src/app/components/document-detail/document-detail.component.scss @@ -26,6 +26,11 @@ max-width: calc(100% - 46px); // fudge factor for ng-select button width } +.btn-group .dropdown-toggle-split { + border-top-right-radius: inherit; + border-bottom-right-radius: inherit; +} + .password-prompt { position: absolute; top: 30%; From 37ddc3b8f757147d817fc398d3a47de3d73f8f95 Mon Sep 17 00:00:00 2001 From: Omar Saleem Date: Thu, 9 Feb 2023 02:25:25 -0500 Subject: [PATCH 11/16] wrapping ports in quotes --- docker/compose/docker-compose.mariadb-tika.yml | 2 +- docker/compose/docker-compose.mariadb.yml | 2 +- docker/compose/docker-compose.portainer.yml | 2 +- docker/compose/docker-compose.postgres-tika.yml | 2 +- docker/compose/docker-compose.postgres.yml | 2 +- docker/compose/docker-compose.sqlite-tika.yml | 2 +- docker/compose/docker-compose.sqlite.yml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docker/compose/docker-compose.mariadb-tika.yml b/docker/compose/docker-compose.mariadb-tika.yml index dc6f7a672..4cd9f71b0 100644 --- a/docker/compose/docker-compose.mariadb-tika.yml +++ b/docker/compose/docker-compose.mariadb-tika.yml @@ -59,7 +59,7 @@ services: - gotenberg - tika ports: - - 8000:8000 + - "8000:8000" healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000"] interval: 30s diff --git a/docker/compose/docker-compose.mariadb.yml b/docker/compose/docker-compose.mariadb.yml index 11aab5068..e25ef78b3 100644 --- a/docker/compose/docker-compose.mariadb.yml +++ b/docker/compose/docker-compose.mariadb.yml @@ -53,7 +53,7 @@ services: - db - broker ports: - - 8000:8000 + - "8000:8000" healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000"] interval: 30s diff --git a/docker/compose/docker-compose.portainer.yml b/docker/compose/docker-compose.portainer.yml index c3720e213..3e76af73c 100644 --- a/docker/compose/docker-compose.portainer.yml +++ b/docker/compose/docker-compose.portainer.yml @@ -53,7 +53,7 @@ services: - db - broker ports: - - 8010:8000 + - "8010:8000" healthcheck: test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"] interval: 30s diff --git a/docker/compose/docker-compose.postgres-tika.yml b/docker/compose/docker-compose.postgres-tika.yml index 1158e7d67..c67fb47d0 100644 --- a/docker/compose/docker-compose.postgres-tika.yml +++ b/docker/compose/docker-compose.postgres-tika.yml @@ -57,7 +57,7 @@ services: - gotenberg - tika ports: - - 8000:8000 + - "8000:8000" healthcheck: test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"] interval: 30s diff --git a/docker/compose/docker-compose.postgres.yml b/docker/compose/docker-compose.postgres.yml index 5a2ab2496..1450507b0 100644 --- a/docker/compose/docker-compose.postgres.yml +++ b/docker/compose/docker-compose.postgres.yml @@ -51,7 +51,7 @@ services: - db - broker ports: - - 8000:8000 + - "8000:8000" healthcheck: test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"] interval: 30s diff --git a/docker/compose/docker-compose.sqlite-tika.yml b/docker/compose/docker-compose.sqlite-tika.yml index a331c1ad1..40d6d93a8 100644 --- a/docker/compose/docker-compose.sqlite-tika.yml +++ b/docker/compose/docker-compose.sqlite-tika.yml @@ -46,7 +46,7 @@ services: - gotenberg - tika ports: - - 8000:8000 + - "8000:8000" healthcheck: test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"] interval: 30s diff --git a/docker/compose/docker-compose.sqlite.yml b/docker/compose/docker-compose.sqlite.yml index 5f5b9063b..9f9e01f68 100644 --- a/docker/compose/docker-compose.sqlite.yml +++ b/docker/compose/docker-compose.sqlite.yml @@ -37,7 +37,7 @@ services: depends_on: - broker ports: - - 8000:8000 + - "8000:8000" healthcheck: test: ["CMD", "curl", "-fs", "-S", "--max-time", "2", "http://localhost:8000"] interval: 30s From 808cf93a195eaacb9b616862235220e609e7c45d Mon Sep 17 00:00:00 2001 From: Omar Saleem Date: Thu, 9 Feb 2023 02:41:47 -0500 Subject: [PATCH 12/16] need quotes in install script too --- install-paperless-ngx.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install-paperless-ngx.sh b/install-paperless-ngx.sh index bbdc6025f..e34415d18 100755 --- a/install-paperless-ngx.sh +++ b/install-paperless-ngx.sh @@ -346,7 +346,7 @@ read -r -a OCR_LANGUAGES_ARRAY <<< "${_split_langs}" fi } > docker-compose.env -sed -i "s/- 8000:8000/- $PORT:8000/g" docker-compose.yml +sed -i "s/- 8000:8000/- \"$PORT:8000\"/g" docker-compose.yml sed -i "s#- \./consume:/usr/src/paperless/consume#- $CONSUME_FOLDER:/usr/src/paperless/consume#g" docker-compose.yml From 7bfb11a7111c87380a8af3269759ff8e71d8fbca Mon Sep 17 00:00:00 2001 From: Omar Saleem Date: Thu, 9 Feb 2023 15:19:02 -0500 Subject: [PATCH 13/16] adding quotes around port 'find' field as requested --- install-paperless-ngx.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install-paperless-ngx.sh b/install-paperless-ngx.sh index e34415d18..c05a4b36c 100755 --- a/install-paperless-ngx.sh +++ b/install-paperless-ngx.sh @@ -346,7 +346,7 @@ read -r -a OCR_LANGUAGES_ARRAY <<< "${_split_langs}" fi } > docker-compose.env -sed -i "s/- 8000:8000/- \"$PORT:8000\"/g" docker-compose.yml +sed -i "s/- \"8000:8000\"/- \"$PORT:8000\"/g" docker-compose.yml sed -i "s#- \./consume:/usr/src/paperless/consume#- $CONSUME_FOLDER:/usr/src/paperless/consume#g" docker-compose.yml From 3e777f2a5bd952534c2393532683cb7af660edd6 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sun, 5 Feb 2023 17:42:35 -0800 Subject: [PATCH 14/16] Fixes up some minor warnings from test code --- src/documents/consumer.py | 2 ++ src/documents/tests/test_api.py | 2 +- src/documents/tests/test_consumer.py | 31 +++++++++++++--------------- src/paperless_mail/tests/test_api.py | 5 +++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 1896415b1..6848cf292 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -342,6 +342,7 @@ class Consumer(LoggingMixin): mime_type, ) if not parser_class: + tempdir.cleanup() self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}") # Notify all listeners that we're going to do some work. @@ -400,6 +401,7 @@ class Consumer(LoggingMixin): except ParseError as e: document_parser.cleanup() + tempdir.cleanup() self._fail( str(e), f"Error while consuming document {self.filename}: {e}", diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index 1b8a71ded..1bef9e7aa 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -3200,7 +3200,7 @@ class TestApiStoragePaths(DirectoriesMixin, APITestCase): self.assertEqual(StoragePath.objects.count(), 1) -class TestTasks(APITestCase): +class TestTasks(DirectoriesMixin, APITestCase): ENDPOINT = "/api/tasks/" ENDPOINT_ACKNOWLEDGE = "/api/acknowledge_tasks/" diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index de368018f..8aaefa242 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -847,13 +847,11 @@ class PreConsumeTestCase(TestCase): self.assertEqual(command[0], script.name) self.assertEqual(command[1], "path-to-file") - self.assertDictContainsSubset( - { - "DOCUMENT_SOURCE_PATH": c.original_path, - "DOCUMENT_WORKING_PATH": c.path, - }, - environment, - ) + subset = { + "DOCUMENT_SOURCE_PATH": c.original_path, + "DOCUMENT_WORKING_PATH": c.path, + } + self.assertDictEqual(environment, {**environment, **subset}) @mock.patch("documents.consumer.Consumer.log") def test_script_with_output(self, mocked_log): @@ -983,16 +981,15 @@ class PostConsumeTestCase(TestCase): self.assertEqual(command[7], "my_bank") self.assertCountEqual(command[8].split(","), ["a", "b"]) - self.assertDictContainsSubset( - { - "DOCUMENT_ID": str(doc.pk), - "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/", - "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/", - "DOCUMENT_CORRESPONDENT": "my_bank", - "DOCUMENT_TAGS": "a,b", - }, - environment, - ) + subset = { + "DOCUMENT_ID": str(doc.pk), + "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/", + "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/", + "DOCUMENT_CORRESPONDENT": "my_bank", + "DOCUMENT_TAGS": "a,b", + } + + self.assertDictEqual(environment, {**environment, **subset}) def test_script_exit_non_zero(self): """ diff --git a/src/paperless_mail/tests/test_api.py b/src/paperless_mail/tests/test_api.py index d20ab5c9a..056c4df90 100644 --- a/src/paperless_mail/tests/test_api.py +++ b/src/paperless_mail/tests/test_api.py @@ -2,12 +2,13 @@ from django.contrib.auth.models import User from documents.models import Correspondent from documents.models import DocumentType from documents.models import Tag +from documents.tests.utils import DirectoriesMixin from paperless_mail.models import MailAccount from paperless_mail.models import MailRule from rest_framework.test import APITestCase -class TestAPIMailAccounts(APITestCase): +class TestAPIMailAccounts(DirectoriesMixin, APITestCase): ENDPOINT = "/api/mail_accounts/" def setUp(self): @@ -165,7 +166,7 @@ class TestAPIMailAccounts(APITestCase): self.assertEqual(returned_account2.password, "123xyz") -class TestAPIMailRules(APITestCase): +class TestAPIMailRules(DirectoriesMixin, APITestCase): ENDPOINT = "/api/mail_rules/" def setUp(self): From add647afe639ecbfacac86b89939b3bac9f1d201 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Sun, 12 Feb 2023 01:53:07 -0800 Subject: [PATCH 15/16] Add missing storage path placeholders --- src/documents/serialisers.py | 6 ++++++ src/documents/tests/test_api.py | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index a86e590ae..8282e46d6 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -599,11 +599,17 @@ class StoragePathSerializer(MatchingModelSerializer): document_type="document_type", created="created", created_year="created_year", + created_year_short="created_year_short", created_month="created_month", + created_month_name="created_month_name", + created_month_name_short="created_month_name_short", created_day="created_day", added="added", added_year="added_year", + added_year_short="added_year_short", added_month="added_month", + added_month_name="added_month_name", + added_month_name_short="added_month_name_short", added_day="added_day", asn="asn", tags="tags", diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index 1bef9e7aa..38ab46b91 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -3199,6 +3199,30 @@ class TestApiStoragePaths(DirectoriesMixin, APITestCase): self.assertEqual(response.status_code, 400) self.assertEqual(StoragePath.objects.count(), 1) + def test_api_storage_path_placeholders(self): + """ + GIVEN: + - API request to create a storage path with placeholders + - Storage path is valid + WHEN: + - API is called + THEN: + - Correct HTTP response + - New storage path is created + """ + response = self.client.post( + self.ENDPOINT, + json.dumps( + { + "name": "Storage path with placeholders", + "path": "{title}/{correspondent}/{document_type}/{created}/{created_year}/{created_year_short}/{created_month}/{created_month_name}/{created_month_name_short}/{created_day}/{added}/{added_year}/{added_year_short}/{added_month}/{added_month_name}/{added_month_name_short}/{added_day}/{asn}/{tags}/{tag_list}/", + }, + ), + content_type="application/json", + ) + self.assertEqual(response.status_code, 201) + self.assertEqual(StoragePath.objects.count(), 2) + class TestTasks(DirectoriesMixin, APITestCase): ENDPOINT = "/api/tasks/" From 8b3d01c49bb17cae7221d61fcd7fa0b41cebd853 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sat, 11 Feb 2023 18:19:02 -0800 Subject: [PATCH 16/16] When splitting via barcodes, cleanup the split documents better --- src/documents/barcodes.py | 11 +++++------ src/documents/tasks.py | 29 ++++++++++++++++++----------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 9adb8aeea..416cf6b2d 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -325,11 +325,10 @@ def save_to_dir( Optionally rename the file. """ if os.path.isfile(filepath) and os.path.isdir(target_dir): - dst = shutil.copy(filepath, target_dir) - logging.debug(f"saved {str(filepath)} to {str(dst)}") - if newname: - dst_new = os.path.join(target_dir, newname) - logger.debug(f"moving {str(dst)} to {str(dst_new)}") - os.rename(dst, dst_new) + dest = target_dir + if newname is not None: + dest = os.path.join(dest, newname) + shutil.copy(filepath, dest) + logging.debug(f"saved {str(filepath)} to {str(dest)}") else: logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.") diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 34b75ce12..1f9c94917 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -128,6 +128,18 @@ def consume_file( ) if document_list: + + # If the file is an upload, it's in the scratch directory + # Move it to consume directory to be picked up + # Otherwise, use the current parent to keep possible tags + # from subdirectories + try: + # is_relative_to would be nicer, but new in 3.9 + _ = path.relative_to(settings.SCRATCH_DIR) + save_to_dir = settings.CONSUMPTION_DIR + except ValueError: + save_to_dir = path.parent + for n, document in enumerate(document_list): # save to consumption dir # rename it to the original filename with number prefix @@ -136,23 +148,18 @@ def consume_file( else: newname = None - # If the file is an upload, it's in the scratch directory - # Move it to consume directory to be picked up - # Otherwise, use the current parent to keep possible tags - # from subdirectories - try: - # is_relative_to would be nicer, but new in 3.9 - _ = path.relative_to(settings.SCRATCH_DIR) - save_to_dir = settings.CONSUMPTION_DIR - except ValueError: - save_to_dir = path.parent - barcodes.save_to_dir( document, newname=newname, target_dir=save_to_dir, ) + # Split file has been copied safely, remove it + os.remove(document) + + # And clean up the directory as well, now it's empty + shutil.rmtree(os.path.dirname(document_list[0])) + # Delete the PDF file which was split os.remove(doc_barcode_info.pdf_path)