# Manual language lookup based on ISO 639.
#
# Provenance: this module (src/documents/languages.py) was introduced by
# commit bcdcfbaee045541dd3d325d9748df10dd831c3a4 (Daniel Quinn,
# 2016-01-23), "Added a manual language lookup based on ISO639".
#
# Thanks to the Library of Congress and some creative use of sed and awk:
# http://www.loc.gov/standards/iso639-2/php/English_list.php

# Maps an ISO 639-1 two-letter language code to its three-letter ISO 639-2
# code.  Where ISO 639-2 defines both a bibliographic and a terminologic
# code, the terminologic one is used (e.g. "de" -> "deu", "fr" -> "fra").
# The single exception is "zh", which maps to a Tessdata file name instead
# of the ISO code (see the note next to that entry).
ISO639 = {
    "aa": "aar",
    "ab": "abk",
    "ae": "ave",
    "af": "afr",
    "ak": "aka",
    "am": "amh",
    "an": "arg",
    "ar": "ara",
    "as": "asm",
    "av": "ava",
    "ay": "aym",
    "az": "aze",
    "ba": "bak",
    "be": "bel",
    "bg": "bul",
    "bh": "bih",
    "bi": "bis",
    "bm": "bam",
    "bn": "ben",
    "bo": "bod",
    "br": "bre",
    "bs": "bos",
    "ca": "cat",
    "ce": "che",
    "ch": "cha",
    "co": "cos",
    "cr": "cre",
    "cs": "ces",
    "cu": "chu",
    "cv": "chv",
    "cy": "cym",
    "da": "dan",
    "de": "deu",
    "dv": "div",
    "dz": "dzo",
    "ee": "ewe",
    "el": "ell",
    "en": "eng",
    "eo": "epo",
    "es": "spa",
    "et": "est",
    "eu": "eus",
    "fa": "fas",
    "ff": "ful",
    "fi": "fin",
    "fj": "fij",
    "fo": "fao",
    "fr": "fra",
    "fy": "fry",
    "ga": "gle",
    "gd": "gla",
    "gl": "glg",
    "gn": "grn",
    "gu": "guj",
    "gv": "glv",
    "ha": "hau",
    "he": "heb",
    "hi": "hin",
    "ho": "hmo",
    "hr": "hrv",
    "ht": "hat",
    "hu": "hun",
    "hy": "hye",
    "hz": "her",
    "ia": "ina",
    "id": "ind",
    "ie": "ile",
    "ig": "ibo",
    "ii": "iii",
    "ik": "ipk",
    "io": "ido",
    "is": "isl",
    "it": "ita",
    "iu": "iku",
    "ja": "jpn",
    "jv": "jav",
    "ka": "kat",
    "kg": "kon",
    "ki": "kik",
    "kj": "kua",
    "kk": "kaz",
    "kl": "kal",
    "km": "khm",
    "kn": "kan",
    "ko": "kor",
    "kr": "kau",
    "ks": "kas",
    "ku": "kur",
    "kv": "kom",
    "kw": "cor",
    "ky": "kir",
    "la": "lat",
    "lb": "ltz",
    "lg": "lug",
    "li": "lim",
    "ln": "lin",
    "lo": "lao",
    "lt": "lit",
    "lu": "lub",
    "lv": "lav",
    "mg": "mlg",
    "mh": "mah",
    "mi": "mri",
    "mk": "mkd",
    "ml": "mal",
    "mn": "mon",
    "mr": "mar",
    "ms": "msa",
    "mt": "mlt",
    "my": "mya",
    "na": "nau",
    "nb": "nob",
    "nd": "nde",
    "ne": "nep",
    "ng": "ndo",
    "nl": "nld",
    # Norwegian Nynorsk was absent from the generated list even though its
    # siblings "nb" and "no" were present.  "nno" is the ISO 639-2 code.
    # NOTE(review): confirm the OCR engine ships data under this name.
    "nn": "nno",
    "no": "nor",
    "nr": "nbl",
    "nv": "nav",
    "ny": "nya",
    "oc": "oci",
    "oj": "oji",
    "om": "orm",
    "or": "ori",
    "os": "oss",
    "pa": "pan",
    "pi": "pli",
    "pl": "pol",
    "ps": "pus",
    "pt": "por",
    "qu": "que",
    "rm": "roh",
    "rn": "run",
    "ro": "ron",
    "ru": "rus",
    "rw": "kin",
    "sa": "san",
    "sc": "srd",
    "sd": "snd",
    "se": "sme",
    "sg": "sag",
    "si": "sin",
    "sk": "slk",
    "sl": "slv",
    "sm": "smo",
    "sn": "sna",
    "so": "som",
    "sq": "sqi",
    "sr": "srp",
    "ss": "ssw",
    "st": "sot",
    "su": "sun",
    "sv": "swe",
    "sw": "swa",
    "ta": "tam",
    "te": "tel",
    "tg": "tgk",
    "th": "tha",
    "ti": "tir",
    "tk": "tuk",
    "tl": "tgl",
    "tn": "tsn",
    "to": "ton",
    "tr": "tur",
    "ts": "tso",
    "tt": "tat",
    "tw": "twi",
    "ty": "tah",
    "ug": "uig",
    "uk": "ukr",
    "ur": "urd",
    "uz": "uzb",
    "ve": "ven",
    "vi": "vie",
    "vo": "vol",
    "wa": "wln",
    "wo": "wol",
    "xh": "xho",
    "yi": "yid",
    "yo": "yor",
    "za": "zha",

    # Tessdata contains two values for Chinese, "chi_sim" and "chi_tra". I have
    # no idea which one is better, so I just picked the bigger file.
    "zh": "chi_tra",

    "zu": "zul",
}