diff --git a/src/paperless_tesseract/checks.py b/src/paperless_tesseract/checks.py index c63761f31..ed5725d36 100644 --- a/src/paperless_tesseract/checks.py +++ b/src/paperless_tesseract/checks.py @@ -16,8 +16,7 @@ def get_tesseract_langs(): # Decode bytes to string, split on newlines, trim out the header proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:] - # Replace _ with - to convert two part languages to the expected code - return [x.replace("_", "-") for x in proc_lines] + return [x.strip() for x in proc_lines] @register() diff --git a/src/paperless_tesseract/tests/test_checks.py b/src/paperless_tesseract/tests/test_checks.py index cfac11d3c..4d46ad9a3 100644 --- a/src/paperless_tesseract/tests/test_checks.py +++ b/src/paperless_tesseract/tests/test_checks.py @@ -27,3 +27,40 @@ class TestChecks(TestCase): msgs = check_default_language_available(None) self.assertEqual(len(msgs), 1) self.assertEqual(msgs[0].level, ERROR) + + @override_settings(OCR_LANGUAGE="chi_sim") + @mock.patch("paperless_tesseract.checks.get_tesseract_langs") + def test_multi_part_language(self, m): + """ + GIVEN: + - An OCR language which is multi part (ie chi_sim) + - The language is correctly formatted + WHEN: + - Installed packages are checked + THEN: + - No errors are reported + """ + m.return_value = ["chi_sim", "eng"] + + msgs = check_default_language_available(None) + + self.assertEqual(len(msgs), 0) + + @override_settings(OCR_LANGUAGE="chi-sim") + @mock.patch("paperless_tesseract.checks.get_tesseract_langs") + def test_multi_part_language_bad_format(self, m): + """ + GIVEN: + - An OCR language which is multi part (ie chi-sim) + - The language is NOT correctly formatted + WHEN: + - Installed packages are checked + THEN: + - An error is reported + """ + m.return_value = ["chi_sim", "eng"] + + msgs = check_default_language_available(None) + + self.assertEqual(len(msgs), 1) + self.assertEqual(msgs[0].level, ERROR)