Fixes language code checks around two-part languages

This commit is contained in:
Trenton Holmes 2022-12-04 08:44:35 -08:00 committed by Trenton H
parent 0e8265f1ae
commit 55ef0d4a1b
2 changed files with 38 additions and 2 deletions

View File

@ -16,8 +16,7 @@ def get_tesseract_langs():
# Decode bytes to string, split on newlines, trim out the header
proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
# Replace _ with - to convert two part languages to the expected code
return [x.replace("_", "-") for x in proc_lines]
return [x.strip() for x in proc_lines]
@register()

View File

@ -27,3 +27,40 @@ class TestChecks(TestCase):
msgs = check_default_language_available(None)
self.assertEqual(len(msgs), 1)
self.assertEqual(msgs[0].level, ERROR)
@override_settings(OCR_LANGUAGE="chi_sim")
@mock.patch("paperless_tesseract.checks.get_tesseract_langs")
def test_multi_part_language(self, m):
    """
    GIVEN:
        - A multi-part OCR language is configured (e.g. chi_sim)
        - The configured code matches one of the codes returned by the
          patched get_tesseract_langs
    WHEN:
        - The default language availability check is run
    THEN:
        - The check returns no messages
    """
    # The patched get_tesseract_langs returns these codes in place of a
    # real lookup.
    m.return_value = ["chi_sim", "eng"]

    reported = check_default_language_available(None)

    self.assertEqual(len(reported), 0)
@override_settings(OCR_LANGUAGE="chi-sim")
@mock.patch("paperless_tesseract.checks.get_tesseract_langs")
def test_multi_part_language_bad_format(self, m):
    """
    GIVEN:
        - An OCR language which is multi part (ie chi-sim)
        - The language is NOT correctly formatted: it is written with a
          hyphen, while the patched get_tesseract_langs returns the
          underscore form (chi_sim)
    WHEN:
        - Installed packages are checked
    THEN:
        - Exactly one message is reported, and its level is ERROR
    """
    # Only the underscore spelling is returned, so the hyphenated setting
    # has no match in the list.
    m.return_value = ["chi_sim", "eng"]

    msgs = check_default_language_available(None)

    self.assertEqual(len(msgs), 1)
    self.assertEqual(msgs[0].level, ERROR)