Changes the NLTK language to be based on the Tesseract OCR language, with fallback to the default processing
Commit c44c914d3d (parent d10d2f5a54) in https://github.com/paperless-ngx/paperless-ngx.git

@@ -75,7 +75,8 @@ class DocumentClassifier:
         self.document_type_classifier = None
         self.storage_path_classifier = None
 
-        self.stemmer = None
+        self._stemmer = None
+        self._stop_words = None
 
     def load(self):
         # Catch warnings for processing
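
Beyond the rename, the point of the two cached fields is that the old code rebuilt its stop-word set on every call to preprocess_content(), whereas the stemmer and stop words are now loaded once and reused. A minimal sketch of that lazy-caching pattern (hypothetical class and method names, not the project's code; assumes nltk and its corpora are installed):

    from typing import Optional, Set

    class LazyNlp:
        """Sketch: defer and cache expensive NLTK setup."""

        def __init__(self) -> None:
            # Placeholders instead of eager loading, as in the commit.
            self._stemmer = None
            self._stop_words: Optional[Set[str]] = None

        def ensure_loaded(self, language: str) -> None:
            # One-time setup, paid only by the first document processed.
            if self._stemmer is None:
                from nltk.stem import SnowballStemmer
                self._stemmer = SnowballStemmer(language)
            if self._stop_words is None:
                from nltk.corpus import stopwords
                self._stop_words = set(stopwords.words(language))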
@@ -302,33 +303,44 @@ class DocumentClassifier:
         Process to contents of a document, distilling it down into
         words which are meaningful to the content
         """
 
+        # Lower case the document
+        content = content.lower().strip()
+        # Reduce spaces
+        content = re.sub(r"\s+", " ", content)
+        # Get only the letters
+        content = re.sub(r"[^\w\s]", " ", content)
+
+        # If the NLTK language is supported, do further processing
+        if settings.NLTK_LANGUAGE is not None:
+
+            import nltk
+
-        from nltk.tokenize import word_tokenize
-        from nltk.corpus import stopwords
-        from nltk.stem import SnowballStemmer
-
-        import nltk
+            from nltk.tokenize import word_tokenize
+            from nltk.corpus import stopwords
+            from nltk.stem import SnowballStemmer
 
-        # Not really hacky, since it isn't private and is documented, but
-        # set the search path for NLTK data to the single location it should be in
-        nltk.data.path = [settings.NLTK_DIR]
+            # Not really hacky, since it isn't private and is documented, but
+            # set the search path for NLTK data to the single location it should be in
+            nltk.data.path = [settings.NLTK_DIR]
 
-        if self.stemmer is None:
-            self.stemmer = SnowballStemmer("english")
+            # Do some one time setup
+            if self._stemmer is None:
+                self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
+            if self._stop_words is None:
+                self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
 
-        # Lower case the document
-        content = content.lower().strip()
-        # Get only the letters (remove punctuation too)
-        content = re.sub(r"[^\w\s]", " ", content)
-        # Tokenize
-        words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
-        # Remove stop words
-        stops = set(stopwords.words(settings.NLTK_LANGUAGE))
-        meaningful_words = [w for w in words if w not in stops]
-        # Stem words
-        meaningful_words = [self.stemmer.stem(w) for w in meaningful_words]
-
-        return " ".join(meaningful_words)
+            # Tokenize
+            words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
+            # Remove stop words
+            meaningful_words = [w for w in words if w not in self._stop_words]
+            # Stem words
+            meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
+
+            return " ".join(meaningful_words)
+
+        return content
 
     def predict_correspondent(self, content):
         if self.correspondent_classifier:
             X = self.data_vectorizer.transform([self.preprocess_content(content)])
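Taken together, the rewritten method reduces to the pipeline below. This is a standalone sketch rather than the project's code: it assumes nltk is installed and that the required tokenizer and stop-word corpora are already on nltk's data path (the real code pins nltk.data.path to settings.NLTK_DIR), and it skips the caching shown above.

    import re
    from typing import List, Optional

    def preprocess(content: str, nltk_language: Optional[str]) -> str:
        # Always applied: lowercase, collapse whitespace, strip punctuation.
        content = content.lower().strip()
        content = re.sub(r"\s+", " ", content)
        content = re.sub(r"[^\w\s]", " ", content)

        # Language-aware path, taken only when the OCR language maps to NLTK.
        if nltk_language is not None:
            from nltk.corpus import stopwords
            from nltk.stem import SnowballStemmer
            from nltk.tokenize import word_tokenize

            words: List[str] = word_tokenize(content, language=nltk_language)
            stops = set(stopwords.words(nltk_language))
            meaningful = [w for w in words if w not in stops]
            stemmer = SnowballStemmer(nltk_language)
            return " ".join(stemmer.stem(w) for w in meaningful)

        # Fallback: no supported NLTK language, return the regex-cleaned text.
        return content
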
@@ -709,4 +709,30 @@ ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
 if ENABLE_UPDATE_CHECK != "default":
     ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
 
-NLTK_LANGUAGE = os.getenv("PAPERLESS_NLTK_LANG", "english").lower()
+
+def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
+    """
+    Maps an ISO 639-2 language code supported by Tesseract into
+    an optional NLTK language name. This is the set of common supported
+    languages for all the NLTK data used.
+    """
+    iso_code_to_nltk = {
+        "dan": "danish",
+        "nld": "dutch",
+        "eng": "english",
+        "fin": "finnish",
+        "fra": "french",
+        "deu": "german",
+        "ita": "italian",
+        "nor": "norwegian",
+        "por": "portuguese",
+        "rus": "russian",
+        "spa": "spanish",
+        "swe": "swedish",
+        "tur": "turkish",
+    }
+
+    return iso_code_to_nltk.get(ocr_lang, None)
+
+
+NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)
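
The practical effect of the mapping (hypothetical demo with an abridged table, not part of the commit): Tesseract codes with NLTK data yield a language name, while anything else, including multi-language strings such as "deu+eng", yields None, which routes preprocess_content() to the plain regex fallback.

    from typing import Optional

    def get_nltk_language(ocr_lang: str) -> Optional[str]:
        # Abridged copy of the mapping above, for demonstration only.
        iso_code_to_nltk = {"eng": "english", "deu": "german"}
        return iso_code_to_nltk.get(ocr_lang, None)

    assert get_nltk_language("deu") == "german"      # supported code
    assert get_nltk_language("jpn") is None          # no NLTK data -> fallback
    assert get_nltk_language("deu+eng") is None      # multi-language OCR -> fallback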
|