diff --git a/docs/setup.rst b/docs/setup.rst index f2970fd9b..8a4adabe0 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -774,6 +774,8 @@ configuring some options in paperless can help improve performance immensely: OCR results. * If using docker, consider setting ``PAPERLESS_WEBSERVER_WORKERS`` to 1. This will save some memory. +* Consider setting ``PAPERLESS_ENABLE_NLTK`` to false to disable the more + advanced language processing, which can use more memory and processing time. For details, refer to :ref:`configuration`. diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 666beffa7..2779fad7b 100644 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -312,7 +312,7 @@ class DocumentClassifier: content = re.sub(r"[^\w\s]", " ", content) # If the NLTK language is supported, do further processing - if settings.NLTK_LANGUAGE is not None: + if settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED: import nltk diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 45544b5c2..cd63105e2 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -709,6 +709,10 @@ ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") if ENABLE_UPDATE_CHECK != "default": ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK") +############################################################################### +# Machine Learning # +############################################################################### + def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]: """ @@ -735,4 +739,6 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]: return iso_code_to_nltk.get(ocr_lang, None) +NLTK_ENABLED: Final[bool] = __get_boolean("PAPERLESS_ENABLE_NLTK", "yes") + NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)