Changes the NLTK language to be based on the Tesseract OCR language, falling back to the default (non-NLTK) processing when the OCR language has no NLTK equivalent

Trenton Holmes 2022-09-18 08:48:26 -07:00 committed by Trenton H
parent d10d2f5a54
commit c44c914d3d
2 changed files with 61 additions and 23 deletions

src/documents/classifier.py

@@ -75,7 +75,8 @@ class DocumentClassifier:
         self.document_type_classifier = None
         self.storage_path_classifier = None

-        self.stemmer = None
+        self._stemmer = None
+        self._stop_words = None

     def load(self):
         # Catch warnings for processing
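The rename from stemmer to _stemmer (plus the new _stop_words) backs the lazy one-time setup added in the next hunk: both objects are now built once, on first use, with the configured language, instead of rebuilding the stop word set for every document. A minimal sketch of that pattern in isolation (hypothetical class name; assumes nltk is installed):

    from nltk.stem import SnowballStemmer


    class LazyStemmer:
        """Builds the stemmer on first use, then reuses it for later calls."""

        def __init__(self, language: str = "english"):
            self._language = language
            self._stemmer = None  # deferred until first use

        def stem(self, word: str) -> str:
            if self._stemmer is None:  # one-time setup
                self._stemmer = SnowballStemmer(self._language)
            return self._stemmer.stem(word)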
@@ -302,32 +303,43 @@ class DocumentClassifier:
         Process to contents of a document, distilling it down into
         words which are meaningful to the content
         """
-        from nltk.tokenize import word_tokenize
-        from nltk.corpus import stopwords
-        from nltk.stem import SnowballStemmer
-        import nltk
-
-        # Not really hacky, since it isn't private and is documented, but
-        # set the search path for NLTK data to the single location it should be in
-        nltk.data.path = [settings.NLTK_DIR]
-
-        if self.stemmer is None:
-            self.stemmer = SnowballStemmer("english")

         # Lower case the document
         content = content.lower().strip()
-        # Get only the letters (remove punctuation too)
+        # Reduce spaces
+        content = re.sub(r"\s+", " ", content)
+        # Get only the letters
         content = re.sub(r"[^\w\s]", " ", content)
-        # Tokenize
-        words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
-        # Remove stop words
-        stops = set(stopwords.words(settings.NLTK_LANGUAGE))
-        meaningful_words = [w for w in words if w not in stops]
-        # Stem words
-        meaningful_words = [self.stemmer.stem(w) for w in meaningful_words]
-
-        return " ".join(meaningful_words)
+
+        # If the NLTK language is supported, do further processing
+        if settings.NLTK_LANGUAGE is not None:
+            import nltk
+            from nltk.tokenize import word_tokenize
+            from nltk.corpus import stopwords
+            from nltk.stem import SnowballStemmer
+
+            # Not really hacky, since it isn't private and is documented, but
+            # set the search path for NLTK data to the single location it should be in
+            nltk.data.path = [settings.NLTK_DIR]
+
+            # Do some one time setup
+            if self._stemmer is None:
+                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+            if self._stop_words is None:
+                self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
+
+            # Tokenize
+            words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
+            # Remove stop words
+            meaningful_words = [w for w in words if w not in self._stop_words]
+            # Stem words
+            meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
+
+            return " ".join(meaningful_words)
+
+        return content

     def predict_correspondent(self, content):
         if self.correspondent_classifier:
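Pulled out of the class, the new control flow can be tried standalone. A minimal sketch with the settings module replaced by plain variables (assumes nltk plus its "punkt" and "stopwords" data are available):

    import re
    from typing import Optional

    NLTK_LANGUAGE: Optional[str] = "english"  # None selects the fallback path


    def preprocess(content: str) -> str:
        # Normalization happens on both paths
        content = content.lower().strip()
        content = re.sub(r"\s+", " ", content)
        content = re.sub(r"[^\w\s]", " ", content)

        if NLTK_LANGUAGE is not None:
            from nltk.corpus import stopwords
            from nltk.stem import SnowballStemmer
            from nltk.tokenize import word_tokenize

            stops = set(stopwords.words(NLTK_LANGUAGE))
            stemmer = SnowballStemmer(NLTK_LANGUAGE)
            words = word_tokenize(content, language=NLTK_LANGUAGE)
            return " ".join(stemmer.stem(w) for w in words if w not in stops)

        # Fallback: normalized text only, no tokenizing/stemming
        return content


    print(preprocess("The quick brown foxes were jumping!"))  # -> "quick brown fox jump"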

src/paperless/settings.py

@@ -709,4 +709,30 @@ ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
 if ENABLE_UPDATE_CHECK != "default":
     ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")

-NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()
+
+def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
+    """
+    Maps an ISO 639-2 language code supported by Tesseract into
+    an optional NLTK language name. This is the set of common supported
+    languages for all the NLTK data used.
+    """
+    iso_code_to_nltk = {
+        "dan": "danish",
+        "nld": "dutch",
+        "eng": "english",
+        "fin": "finnish",
+        "fra": "french",
+        "deu": "german",
+        "ita": "italian",
+        "nor": "norwegian",
+        "por": "portuguese",
+        "rus": "russian",
+        "spa": "spanish",
+        "swe": "swedish",
+        "tur": "turkish",
+    }
+
+    return iso_code_to_nltk.get(ocr_lang, None)
+
+
+NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)
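A few illustrative calls with hypothetical inputs; the last case matters because OCR_LANGUAGE (from PAPERLESS_OCR_LANGUAGE) may hold a combined value like "deu+eng", which the plain dict lookup will not match:

    _get_nltk_language_setting("deu")      # -> "german": NLTK processing enabled
    _get_nltk_language_setting("jpn")      # -> None: no common NLTK data, basic processing
    _get_nltk_language_setting("deu+eng")  # -> None: combined Tesseract codes also fall back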