From 0bf9e55ca79983f4d5e5f42794f494f51f555345 Mon Sep 17 00:00:00 2001 From: Trenton Holmes Date: Thu, 1 Sep 2022 10:35:33 -0700 Subject: [PATCH 1/2] Fixes a minor TODO in settings, and enables flake8 for settings.py --- src/paperless/settings.py | 37 +++++++++++++++++++---------------- src/paperless_tika/apps.py | 2 +- src/paperless_tika/parsers.py | 6 +++--- src/setup.cfg | 2 +- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 7ec260b1a..515e2a05f 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -285,7 +285,7 @@ SECRET_KEY = os.getenv( AUTH_PASSWORD_VALIDATORS = [ { - "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", + "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", # noqa: E501 }, { "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", @@ -445,13 +445,14 @@ LOGGING = { TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", 1) -PAPERLESS_WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800) +WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800) # Per django-q docs, timeout must be smaller than retry -# We default retry to 10s more than the timeout -PAPERLESS_WORKER_RETRY: Final[int] = __get_int( +# We default retry to 10s more than the timeout to silence the +# warning, as retry functionality isn't used. 
+WORKER_RETRY: Final[int] = __get_int( "PAPERLESS_WORKER_RETRY", - PAPERLESS_WORKER_TIMEOUT + 10, + WORKER_TIMEOUT + 10, ) Q_CLUSTER = { @@ -459,8 +460,8 @@ Q_CLUSTER = { "guard_cycle": 5, "catch_up": False, "recycle": 1, - "retry": PAPERLESS_WORKER_RETRY, - "timeout": PAPERLESS_WORKER_TIMEOUT, + "retry": WORKER_RETRY, + "timeout": WORKER_TIMEOUT, "workers": TASK_WORKERS, "redis": os.getenv("PAPERLESS_REDIS", "redis://localhost:6379"), "log_level": "DEBUG" if DEBUG else "INFO", @@ -507,7 +508,7 @@ CONSUMER_IGNORE_PATTERNS = list( json.loads( os.getenv( "PAPERLESS_CONSUMER_IGNORE_PATTERNS", - '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]', + '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]', # noqa: E501 ), ), ) @@ -589,7 +590,8 @@ DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") # Maximum number of dates taken from document start to end to show as suggestions for -# `created` date in the frontend. Duplicates are removed, which can result in fewer dates shown. +# `created` date in the frontend. Duplicates are removed, which can result in +# fewer dates shown. NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3) # Transformations applied before filename parsing @@ -600,7 +602,8 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")): # Specify the filename format for out files FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT") -# If this is enabled, variables in filename format will resolve to empty-string instead of 'none'. +# If this is enabled, variables in filename format will resolve to +# empty-string instead of 'none'. # Directories with 'empty names' are omitted, too. 
FILENAME_FORMAT_REMOVE_NONE = __get_boolean( "PAPERLESS_FILENAME_FORMAT_REMOVE_NONE", @@ -612,16 +615,15 @@ THUMBNAIL_FONT_NAME = os.getenv( "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf", ) -# TODO: this should not have a prefix. # Tika settings -PAPERLESS_TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO") -PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost:9998") -PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( +TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO") +TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost:9998") +TIKA_GOTENBERG_ENDPOINT = os.getenv( "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000", ) -if PAPERLESS_TIKA_ENABLED: +if TIKA_ENABLED: INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") @@ -634,8 +636,9 @@ def _parse_ignore_dates( user provided string(s) into dates Args: - env_ignore (str): The value of the environment variable, comma seperated dates - date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER. + env_ignore (str): The value of the environment variable, comma separated dates + date_order (str, optional): The format of the date strings. + Defaults to DATE_ORDER. 
Returns: Set[datetime.datetime]: The set of parsed date objects diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py index 5cab21427..012986543 100644 --- a/src/paperless_tika/apps.py +++ b/src/paperless_tika/apps.py @@ -9,6 +9,6 @@ class PaperlessTikaConfig(AppConfig): def ready(self): from documents.signals import document_consumer_declaration - if settings.PAPERLESS_TIKA_ENABLED: + if settings.TIKA_ENABLED: document_consumer_declaration.connect(tika_consumer_declaration) AppConfig.ready(self) diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 0cd0caeab..e706e3aa5 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -27,7 +27,7 @@ class TikaDocumentParser(DocumentParser): ) def extract_metadata(self, document_path, mime_type): - tika_server = settings.PAPERLESS_TIKA_ENDPOINT + tika_server = settings.TIKA_ENDPOINT try: parsed = parser.from_file(document_path, tika_server) except Exception as e: @@ -49,7 +49,7 @@ class TikaDocumentParser(DocumentParser): def parse(self, document_path, mime_type, file_name=None): self.log("info", f"Sending {document_path} to Tika server") - tika_server = settings.PAPERLESS_TIKA_ENDPOINT + tika_server = settings.TIKA_ENDPOINT try: parsed = parser.from_file(document_path, tika_server) @@ -73,7 +73,7 @@ class TikaDocumentParser(DocumentParser): def convert_to_pdf(self, document_path, file_name): pdf_path = os.path.join(self.tempdir, "convert.pdf") - gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT + gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT url = gotenberg_server + "/forms/libreoffice/convert" self.log("info", f"Converting {document_path} to PDF as {pdf_path}") diff --git a/src/setup.cfg b/src/setup.cfg index 3b50151a7..409c5c7cd 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -1,5 +1,5 @@ [flake8] -extend-exclude = */migrations/*, paperless/settings.py, */tests/* +extend-exclude = */migrations/*, */tests/* # E203 - 
https://www.flake8rules.com/rules/E203.html
 # W503 - https://www.flake8rules.com/rules/W503.html
 ignore = E203,W503

From d408900a916b02c5c5d5b52a17109a0cb9072852 Mon Sep 17 00:00:00 2001
From: Trenton Holmes
Date: Fri, 9 Sep 2022 11:42:43 -0700
Subject: [PATCH 2/2] Adds validation and testing to cover some of the common settings

---
 src/paperless/__init__.py          |  3 +-
 src/paperless/checks.py            | 49 ++++++++++++++++
 src/paperless/settings.py          |  2 -
 src/paperless/tests/test_checks.py | 92 +++++++++++++++++++++++++++++-
 4 files changed, 140 insertions(+), 6 deletions(-)

diff --git a/src/paperless/__init__.py b/src/paperless/__init__.py
index 8cdd600b3..1c7f09cbe 100644
--- a/src/paperless/__init__.py
+++ b/src/paperless/__init__.py
@@ -1,4 +1,5 @@
 from .checks import binaries_check
 from .checks import paths_check
+from .checks import settings_values_check
 
-__all__ = ["binaries_check", "paths_check"]
+__all__ = ["binaries_check", "paths_check", "settings_values_check"]
diff --git a/src/paperless/checks.py b/src/paperless/checks.py
index 26d18b692..c9ac5cb6a 100644
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -96,3 +96,52 @@ def debug_mode_check(app_configs, **kwargs):
         ]
     else:
         return []
+
+
+@register()
+def settings_values_check(app_configs, **kwargs):
+    """
+    Validates at least some of the user provided settings
+    """
+
+    def _ocrmypdf_settings_check():
+        """
+        Validates the OCR settings against the values paperless accepts.
+        The output type follows "ocrmypdf --output-type"; the mode and clean
+        values are paperless options which are mapped to ocrmypdf arguments
+        """
+        msgs = []
+        if settings.OCR_OUTPUT_TYPE not in {
+            "pdfa",
+            "pdf",
+            "pdfa-1",
+            "pdfa-2",
+            "pdfa-3",
+        }:
+            msgs.append(
+                Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
+            )
+
+        if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
+            msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
+
+        if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
+            msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
+        return msgs
+
+    def _timezone_validate():
+        """
+        Validates the user provided timezone is a valid timezone
+        """
+        try:
+            import zoneinfo
+        except ImportError:  # pragma: nocover
+            import backports.zoneinfo as zoneinfo
+        msgs = []
+        if settings.TIME_ZONE not in zoneinfo.available_timezones():
+            msgs.append(
+                Error(f'Timezone "{settings.TIME_ZONE}" is not a valid timezone'),
+            )
+        return msgs
+
+    return _ocrmypdf_settings_check() + _timezone_validate()
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 515e2a05f..92042ac31 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -532,11 +532,9 @@ OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
 
 # OCRmyPDF --output-type options are available.
-# TODO: validate this setting.
 OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
 
 # skip. redo, force
-# TODO: validate this.
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py index ba45ebf79..b2d8b5810 100644 --- a/src/paperless/tests/test_checks.py +++ b/src/paperless/tests/test_checks.py @@ -1,12 +1,12 @@ import os -import shutil from django.test import override_settings from django.test import TestCase from documents.tests.utils import DirectoriesMixin -from paperless import binaries_check -from paperless import paths_check +from paperless.checks import binaries_check from paperless.checks import debug_mode_check +from paperless.checks import paths_check +from paperless.checks import settings_values_check class TestChecks(DirectoriesMixin, TestCase): @@ -54,3 +54,89 @@ class TestChecks(DirectoriesMixin, TestCase): @override_settings(DEBUG=True) def test_debug_enabled(self): self.assertEqual(len(debug_mode_check(None)), 1) + + +class TestSettingsChecks(DirectoriesMixin, TestCase): + def test_all_valid(self): + """ + GIVEN: + - Default settings + WHEN: + - Settings are validated + THEN: + - No system check errors reported + """ + msgs = settings_values_check(None) + self.assertEqual(len(msgs), 0) + + @override_settings(OCR_OUTPUT_TYPE="notapdf") + def test_invalid_output_type(self): + """ + GIVEN: + - Default settings + - OCR output type is invalid + WHEN: + - Settings are validated + THEN: + - system check error reported for OCR output type + """ + msgs = settings_values_check(None) + self.assertEqual(len(msgs), 1) + + msg = msgs[0] + + self.assertIn('OCR output type "notapdf"', msg.msg) + + @override_settings(OCR_MODE="makeitso") + def test_invalid_ocr_type(self): + """ + GIVEN: + - Default settings + - OCR mode is invalid + WHEN: + - Settings are validated + THEN: + - system check error reported for OCR mode + """ + msgs = settings_values_check(None) + self.assertEqual(len(msgs), 1) + + msg = msgs[0] + + self.assertIn('OCR output mode "makeitso"', 
msg.msg) + + @override_settings(OCR_CLEAN="cleanme") + def test_invalid_ocr_clean(self): + """ + GIVEN: + - Default settings + - OCR cleaning type is invalid + WHEN: + - Settings are validated + THEN: + - system check error reported for OCR cleaning type + """ + msgs = settings_values_check(None) + self.assertEqual(len(msgs), 1) + + msg = msgs[0] + + self.assertIn('OCR clean mode "cleanme"', msg.msg) + + @override_settings(TIME_ZONE="TheMoon\\MyCrater") + def test_invalid_timezone(self): + """ + GIVEN: + - Default settings + - Timezone is invalid + WHEN: + - Settings are validated + THEN: + - system check error reported for timezone + """ + msgs = settings_values_check(None) + self.assertEqual(len(msgs), 1) + + msg = msgs[0] + + self.assertIn('Timezone "TheMoon\\MyCrater"', msg.msg)