def ocr_to_dateparser_languages() -> dict[str, str]:
    """
    Return the translation map from Tesseract OCR language codes to the
    language codes understood by dateparser.

    Keys are the (mostly 3-letter) Tesseract language codes, values are the
    matching dateparser language codes. A fresh dict is built on every call,
    so callers may mutate the result freely.

    To add a language, make sure it is supported by both libraries.
    The ISO 639-2 will help you link a 3-char to 2-char language code.

    Links:
    - Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
    - Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html
    - ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php
    """
    # TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard,
    # so we didn't find the equivalent in Tesseract:
    # agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
    # ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
    # rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
    return {
        "afr": "af",
        "amh": "am",
        "ara": "ar",
        "asm": "as",
        "ast": "ast",
        "aze": "az",
        "bel": "be",
        "bul": "bg",
        "ben": "bn",
        "bod": "bo",
        "bre": "br",
        "bos": "bs",
        "cat": "ca",
        # Tesseract's Cherokee language code is "chr"; "cher" is kept only so
        # existing lookups with the previous key keep working.
        "chr": "chr",
        "cher": "chr",
        "ces": "cs",
        "cym": "cy",
        "dan": "da",
        "deu": "de",
        "dzo": "dz",
        "ell": "el",
        "eng": "en",
        "epo": "eo",
        "spa": "es",
        "est": "et",
        "eus": "eu",
        "fas": "fa",
        "fin": "fi",
        "fil": "fil",
        "fao": "fo",  # codespell:ignore
        "fra": "fr",
        "fry": "fy",
        "gle": "ga",
        "gla": "gd",
        "glg": "gl",
        "guj": "gu",
        "heb": "he",
        "hin": "hi",
        "hrv": "hr",
        "hun": "hu",
        "hye": "hy",
        "ind": "id",
        "isl": "is",
        "ita": "it",
        "jpn": "ja",
        "kat": "ka",
        "kaz": "kk",
        "khm": "km",
        # Tesseract's Kannada language code is "kan"; "knda" is kept only so
        # existing lookups with the previous key keep working.
        "kan": "kn",
        "knda": "kn",
        "kor": "ko",
        "kir": "ky",
        "ltz": "lb",
        "lao": "lo",
        "lit": "lt",
        "lav": "lv",
        "mal": "ml",
        "mon": "mn",
        "mar": "mr",
        "msa": "ms",
        "mlt": "mt",
        "mya": "my",
        "nep": "ne",
        "nld": "nl",
        "ori": "or",
        "pan": "pa",
        "pol": "pl",
        "pus": "ps",
        "por": "pt",
        "que": "qu",
        "ron": "ro",
        "rus": "ru",
        "sin": "si",
        "slk": "sk",
        "slv": "sl",
        "sqi": "sq",
        "srp": "sr",
        "swe": "sv",
        "swa": "sw",
        "tam": "ta",
        "tel": "te",  # codespell:ignore
        "tha": "th",  # codespell:ignore
        "tir": "ti",
        "tgl": "tl",
        "ton": "to",
        "tur": "tr",
        "uig": "ug",
        "ukr": "uk",
        "urd": "ur",
        "uzb": "uz",
        # Vietnamese is "vie" in Tesseract (the key was previously misspelled
        # as "via", so Vietnamese never matched).
        "vie": "vi",  # codespell:ignore
        "yid": "yi",
        "yor": "yo",
        # Tesseract ships "chi_sim"/"chi_tra"; presumably the caller strips the
        # script suffix before looking up "chi" — verify against the caller.
        "chi": "zh",
    }