paperless-ngx/src/paperless/utils.py

def ocr_to_dateparser_languages() -> dict[str, str]:
    """
    Return the translation map from Tesseract OCR language codes to
    dateparser language codes.

    Keys are the 3-letter language codes used by Tesseract (the name of the
    corresponding ``.traineddata`` file); values are the language codes
    understood by dateparser. A new dict is built on every call.

    To add a language, make sure it is supported by both libraries and that
    the key matches the Tesseract code exactly.
    The ISO 639-2 list will help you link a 3-char to a 2-char language code.
    Links:
    - Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
    - Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html
    - ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php
    """
    # TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard,
    # so we didn't find the equivalent in Tesseract:
    # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
    # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
    # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
    return {
        "afr": "af",
        "amh": "am",
        "ara": "ar",
        "asm": "as",
        "ast": "ast",
        "aze": "az",
        "bel": "be",
        "bul": "bg",
        "ben": "bn",
        "bod": "bo",
        "bre": "br",
        "bos": "bs",
        "cat": "ca",
        "chr": "chr",
        "ces": "cs",
        "cym": "cy",
        "dan": "da",
        "deu": "de",
        "dzo": "dz",
        "ell": "el",
        "eng": "en",
        "epo": "eo",
        "spa": "es",
        "est": "et",
        "eus": "eu",
        "fas": "fa",
        "fin": "fi",
        "fil": "fil",
        "fao": "fo",  # codespell:ignore
        "fra": "fr",
        "fry": "fy",
        "gle": "ga",
        "gla": "gd",
        "glg": "gl",
        "guj": "gu",
        "heb": "he",
        "hin": "hi",
        "hrv": "hr",
        "hun": "hu",
        "hye": "hy",
        "ind": "id",
        "isl": "is",
        "ita": "it",
        "jpn": "ja",
        "kat": "ka",
        "kaz": "kk",
        "khm": "km",
        "kan": "kn",
        "kor": "ko",
        "kir": "ky",
        "ltz": "lb",
        "lao": "lo",
        "lit": "lt",
        "lav": "lv",
        "mal": "ml",
        "mon": "mn",
        "mar": "mr",
        "msa": "ms",
        "mlt": "mt",
        "mya": "my",
        "nep": "ne",
        "nld": "nl",
        "ori": "or",
        "pan": "pa",
        "pol": "pl",
        "pus": "ps",
        "por": "pt",
        "que": "qu",
        "ron": "ro",
        "rus": "ru",
        "sin": "si",
        "slk": "sk",
        "slv": "sl",
        "sqi": "sq",
        "srp": "sr",
        "swe": "sv",
        "swa": "sw",
        "tam": "ta",
        "tel": "te",  # codespell:ignore
        "tha": "th",  # codespell:ignore
        "tir": "ti",
        "tgl": "tl",
        "ton": "to",
        "tur": "tr",
        "uig": "ug",
        "ukr": "uk",
        "urd": "ur",
        "uzb": "uz",
        "vie": "vi",
        "yid": "yi",
        "yor": "yo",
        # Tesseract ships Chinese as "chi_sim" / "chi_tra"; "chi" is the shared
        # base code. NOTE(review): presumably callers strip the "_<script>"
        # suffix before looking up this map - confirm against the caller.
        "chi": "zh",
    }