From 1e891414a385b8de180c7028ea904384e2b0b718 Mon Sep 17 00:00:00 2001 From: Trenton H Date: Fri, 23 Sep 2022 07:32:17 -0700 Subject: [PATCH] Allows disabling NLTK, adds it as a consideration for low power devices --- docs/setup.rst | 2 ++ src/documents/classifier.py | 2 +- src/paperless/settings.py | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/setup.rst b/docs/setup.rst index f2970fd9b..8a4adabe0 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -774,6 +774,8 @@ configuring some options in paperless can help improve performance immensely: OCR results. * If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to 1. This will save some memory. +* Consider setting ``PAPERLESS_ENABLE_NLTK`` to false, to disable the more + advanced language processing, which can take more memory and processing time. For details, refer to :ref:`configuration`. diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 666beffa7..2779fad7b 100644 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -312,7 +312,7 @@ class DocumentClassifier: content = re.sub(r"[^\w\s]", " ", content) # If the NLTK language is supported, do further processing - if settings.NLTK_LANGUAGE is not None: + if settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED: import nltk diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 45544b5c2..cd63105e2 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -709,6 +709,10 @@ ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") if ENABLE_UPDATE_CHECK != "default": ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK") +############################################################################### +# Machine Learning # +############################################################################### + def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]: """ @@ -735,4 +739,6 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]: return iso_code_to_nltk.get(ocr_lang, None) +NLTK_ENABLED: Final[bool] = __get_boolean("PAPERLESS_ENABLE_NLTK", "yes") + NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)