Merge pull request #2057 from paperless-ngx/fix/2044-lang-code-diffs

Bugfix: Some tesseract languages aren't detected as installed.
This commit is contained in:
shamoon 2022-11-28 11:04:44 -08:00 committed by GitHub
commit 5d3a6e230d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -1,3 +1,4 @@
+import shutil
 import subprocess

 from django.conf import settings
@@ -7,10 +8,16 @@ from django.core.checks import Warning


 def get_tesseract_langs():
-    with subprocess.Popen(["tesseract", "--list-langs"], stdout=subprocess.PIPE) as p:
-        stdout, stderr = p.communicate()
+    proc = subprocess.run(
+        [shutil.which("tesseract"), "--list-langs"],
+        capture_output=True,
+    )

-    return stdout.decode().strip().split("\n")[1:]
+    # Decode bytes to string, split on newlines, trim out the header
+    proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
+
+    # Replace _ with - to convert two part languages to the expected code
+    return [x.replace("_", "-") for x in proc_lines]


 @register()