From 55ef0d4a1b62c3abe8500cad97ddeecf9f746b84 Mon Sep 17 00:00:00 2001
From: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
Date: Sun, 4 Dec 2022 08:44:35 -0800
Subject: [PATCH] Fixes language code checks around two part languages

---
 src/paperless_tesseract/checks.py            |  3 +-
 src/paperless_tesseract/tests/test_checks.py | 37 ++++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/src/paperless_tesseract/checks.py b/src/paperless_tesseract/checks.py
index c63761f31..ed5725d36 100644
--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@@ -16,8 +16,7 @@ def get_tesseract_langs():
     # Decode bytes to string, split on newlines, trim out the header
     proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
 
-    # Replace _ with - to convert two part languages to the expected code
-    return [x.replace("_", "-") for x in proc_lines]
+    return [x.strip() for x in proc_lines]
 
 
 @register()
diff --git a/src/paperless_tesseract/tests/test_checks.py b/src/paperless_tesseract/tests/test_checks.py
index cfac11d3c..4d46ad9a3 100644
--- a/src/paperless_tesseract/tests/test_checks.py
+++ b/src/paperless_tesseract/tests/test_checks.py
@@ -27,3 +27,40 @@ class TestChecks(TestCase):
         msgs = check_default_language_available(None)
         self.assertEqual(len(msgs), 1)
         self.assertEqual(msgs[0].level, ERROR)
+
+    @override_settings(OCR_LANGUAGE="chi_sim")
+    @mock.patch("paperless_tesseract.checks.get_tesseract_langs")
+    def test_multi_part_language(self, m):
+        """
+        GIVEN:
+            - An OCR language which is multi part (e.g. chi_sim)
+            - The language is correctly formatted
+        WHEN:
+            - Installed packages are checked
+        THEN:
+            - No errors are reported
+        """
+        m.return_value = ["chi_sim", "eng"]
+
+        msgs = check_default_language_available(None)
+
+        self.assertEqual(len(msgs), 0)
+
+    @override_settings(OCR_LANGUAGE="chi-sim")
+    @mock.patch("paperless_tesseract.checks.get_tesseract_langs")
+    def test_multi_part_language_bad_format(self, m):
+        """
+        GIVEN:
+            - An OCR language which is multi part (e.g. chi-sim)
+            - The language is NOT correctly formatted
+        WHEN:
+            - Installed packages are checked
+        THEN:
+            - An error is reported
+        """
+        m.return_value = ["chi_sim", "eng"]
+
+        msgs = check_default_language_available(None)
+
+        self.assertEqual(len(msgs), 1)
+        self.assertEqual(msgs[0].level, ERROR)