-- Module:Wikt-lang/data

-- Shorthand: builds a string from one or more Unicode code points.
local U = mw.ustring.char

-- Combining diacritics from the Unicode "Combining Diacritical Marks" block.
-- They are spliced into the replacement patterns of the language table below.
local grave        = U(0x0300) -- COMBINING GRAVE ACCENT
local acute        = U(0x0301) -- COMBINING ACUTE ACCENT
local circumflex   = U(0x0302) -- COMBINING CIRCUMFLEX ACCENT
local tilde        = U(0x0303) -- COMBINING TILDE
local macron       = U(0x0304) -- COMBINING MACRON
local breve        = U(0x0306) -- COMBINING BREVE
local dot          = U(0x0307) -- COMBINING DOT ABOVE
local diaeresis    = U(0x0308) -- COMBINING DIAERESIS
local double_acute = U(0x030B) -- COMBINING DOUBLE ACUTE ACCENT
local caron        = U(0x030C) -- COMBINING CARON
local double_grave = U(0x030F) -- COMBINING DOUBLE GRAVE ACCENT
local invbreve     = U(0x0311) -- COMBINING INVERTED BREVE
local dot_below    = U(0x0323) -- COMBINING DOT BELOW
local undertie     = U(0x035C) -- COMBINING DOUBLE BREVE BELOW

--[[
Data table for Module:Wikt-lang.

data.languages maps a Wiktionary language code to a table of optional fields:

  name            The "canonical name" used on Wiktionary.
  article         The Wikipedia article about the language.
  Wikipedia_name  A second name, given where it differs from "name"
                  (NOTE(review): meaning inferred from the key and the
                  Proto-Hellenic entries; confirm against the consumer).
  Wikipedia_code  Given on entries marked "Incorrect tag"; names the
                  corresponding "-x-proto" code.
  type            "reconstructed" or "appendix".
  direction       Text direction; only "rtl" is used here.
  replacements    Text substitutions such as stripping diacritics. Two shapes
                  occur:
                    * a map from pattern to replacement string;
                    * { decompose = true, from = {...}, to = {...} }, two
                      parallel lists. A "from" item with no counterpart in
                      "to" is removed, as the per-entry comments describe.
                      NOTE(review): "decompose" presumably asks the consumer
                      to decompose the text first so that the combining marks
                      can match; confirm against the consumer.
  scripts         ISO 15924 script code(s). Only present in commented-out
                  form at the moment.

data.redirects is described next to its definition below.
]]
local data = {
	["languages"] = {
		["aaq"] = {
			["name"] = "Penobscot",
		},
		["ab"] = {
			["name"] = "Abkhaz",
		},
		["abe"] = {
			["name"] = "Abenaki",
		},
		["ang"] = {
			["name"] = "Old English",
			-- NOTE(review): this is the only "article" given as a table rather
			-- than a string; kept as-is, confirm the consumer accepts it.
			["article"] = {"Old English"},
			-- Remove macrons, acutes, and overdots
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. acute .. dot .. "]" },
			},
		},
		["ar"] = {
			["name"] = "Arabic",
			["article"] = "Arabic language",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = {
				-- ālif with wasla is replaced by ālif;
				[U(0x0671)] = U(0x0627),
				-- taṭwīl, fatḥatan, ḍammatan, kasratan,
				-- fatḥa, ḍamma, kasra,
				-- shadda, sukūn, and superscript (dagger) ālif are removed.
				["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
				..U(0x064E)..U(0x064F)..U(0x0650)
				..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
			},
		},
		["ara"] = {
			["name"] = "Arabic",
			["article"] = "Arabic language",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = {
				-- ālif with wasla is replaced by ālif;
				[U(0x0671)] = U(0x0627),
				-- taṭwīl, fatḥatan, ḍammatan, kasratan,
				-- fatḥa, ḍamma, kasra,
				-- shadda, sukūn, and superscript (dagger) ālif are removed.
				["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
				..U(0x064E)..U(0x064F)..U(0x0650)
				..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
			},
		},
		["arb"] = {
			["name"] = "Modern Standard Arabic",
			["article"] = "Modern Standard Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = {
				-- ālif with wasla is replaced by ālif;
				[U(0x0671)] = U(0x0627),
				-- taṭwīl, fatḥatan, ḍammatan, kasratan,
				-- fatḥa, ḍamma, kasra,
				-- shadda, sukūn, and superscript (dagger) ālif are removed.
				["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
				..U(0x064E)..U(0x064F)..U(0x0650)
				..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
			},
		},
		["apc"] = {
			["name"] = "North Levantine Arabic",
			["article"] = "North Levantine Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = {
				-- ālif with wasla is replaced by ālif;
				[U(0x0671)] = U(0x0627),
				-- taṭwīl, fatḥatan, ḍammatan, kasratan,
				-- fatḥa, ḍamma, kasra,
				-- shadda, sukūn, and superscript (dagger) ālif are removed.
				["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
				..U(0x064E)..U(0x064F)..U(0x0650)
				..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
			},
		},
		["ajp"] = {
			["name"] = "South Levantine Arabic",
			["article"] = "South Levantine Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = {
				-- ālif with wasla is replaced by ālif;
				[U(0x0671)] = U(0x0627),
				-- taṭwīl, fatḥatan, ḍammatan, kasratan,
				-- fatḥa, ḍamma, kasra,
				-- shadda, sukūn, and superscript (dagger) ālif are removed.
				["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
				..U(0x064E)..U(0x064F)..U(0x0650)
				..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
			},
		},
		["arz"] = {
			["name"] = "Egyptian Arabic",
			["article"] = "Egyptian Arabic",
			["direction"] = "rtl", -- Should be in the script data module.
			["replacements"] = {
				-- ālif with wasla is replaced by ālif;
				[U(0x0671)] = U(0x0627),
				-- taṭwīl, fatḥatan, ḍammatan, kasratan,
				-- fatḥa, ḍamma, kasra,
				-- shadda, sukūn, and superscript (dagger) ālif are removed.
				["["..U(0x0640)..U(0x064B)..U(0x064C)..U(0x064D)
				..U(0x064E)..U(0x064F)..U(0x0650)
				..U(0x0651)..U(0x0652)..U(0x0670).."]"] = "",
			},
		},
		["av"] = {
			["name"] = "Avar",
		},
		["be"] = {
			["article"] = "Belarusian language",
			["replacements"] = { [acute] = "", },
		},
		["bn"] = {
			["name"] = "Bengali",
			["article"] = "Bengali language",
		},
		["bua"] = {
			["name"] = "Buryat",
		},
		["cel-pro"] = { -- Incorrect tag
			["name"] = "Proto-Celtic",
			["Wikipedia_code"] = "cel-x-proto",
		},
		["cel-x-proto"] = {
			["name"] = "Proto-Celtic",
		},
		["cel-bry-pro"] = { -- Incorrect tag
			["name"] = "Proto-Brythonic",
			["article"] = "Common Brittonic",
			["type"] = "reconstructed",
		},
		["com"] = {
			["name"] = "Comanche",
			["article"] = "Comanche language",
		},
		["cu"] = {
			["name"] = "Old Church Slavonic",
			["article"] = "Old Church Slavonic",
		},
		["de"] = {
			["name"] = "German",
			["article"] = "German language",
		},
		["en"] = {
			["name"] = "English",
			["article"] = "English language",
		},
		["es"] = {
			["name"] = "Spanish",
			["article"] = "Spanish language",
		},
		["egy"] = {
			["name"] = "Egyptian",
		},
		["evn"] = {
			["name"] = "Evenki",
			["article"] = "Evenki language",
		},
		["fr"] = {
			["name"] = "French",
			["article"] = "French language",
		},
		["frm"] = {
			["name"] = "Middle French",
			["article"] = "Middle French",
		},
		["frp"] = {
			["name"] = "Franco-Provençal",
		},
		["ff"] = {
			["name"] = "Fula",
		},
		["gem-pro"] = { -- Incorrect tag
			["name"] = "Proto-Germanic",
			["article"] = "Proto-Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "gem-x-proto",
		},
		["gem-x-proto"] = {
			["name"] = "Proto-Germanic",
			["article"] = "Proto-Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["gml"] = {
			["name"] = "Middle Low German",
		},
		["gmw-ecg"] = {
			["name"] = "East Central German",
		},
		["gmw-x-proto"] = {
			["name"] = "Proto-West Germanic",
			["article"] = "Proto-West Germanic language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["gmq-x-gut"] = {
			["name"] = "Gutnish",
			["article"] = "Gutnish",
		},
		["goh"] = {
			["replacements"] = {
				decompose = true,
				from = {
					"[" .. macron .. circumflex .. diaeresis .. "]",
				},
			},
		},
		["got"] = {
			["name"] = "Gothic",
			["article"] = "Gothic language",
			["replacements"] = {
				-- Latin to Gothic since people will not want to have to copy
				-- and paste Gothic letters in
				["[AÁaáĀā]"] = "𐌰",
				["[Bb]"] = "𐌱",
				["[Gg]"] = "𐌲",
				["[Dd]"] = "𐌳",
				["[EeĒē]"] = "𐌴",
				["[Qq]"] = "𐌵",
				["[Zz]"] = "𐌶",
				["[Hh]"] = "𐌷",
				["[Þþ]"] = "𐌸",
				["[IiÍí]"] = "𐌹",
				["[Kk]"] = "𐌺",
				["[Ll]"] = "𐌻",
				["[Mm]"] = "𐌼",
				["[Nn]"] = "𐌽",
				["[Jj]"] = "𐌾",
				["[UuÚúŪū]"] = "𐌿",
				["[Pp]"] = "𐍀",
				["[Rr]"] = "𐍂",
				["[Ss]"] = "𐍃",
				["[Tt]"] = "𐍄",
				["[WwYy]"] = "𐍅",
				["[Ff]"] = "𐍆",
				["[Xx]"] = "𐍇",
				["[Ƕƕ]"] = "𐍈", -- Not sure if "hw" and "hv" can safely be converted
				["[OoŌō]"] = "𐍉",
			},
		},
		["gsw"] = {
			["name"] = "Alemannic German",
		},
		["grc"] = {
			["name"] = "Ancient Greek",
			["article"] = "Ancient Greek",
			["replacements"] = {
				decompose = true,
				from = {
					-- Replace variant letterforms with standard ones.
					"ϐ", "ϵ", "ϑ", "ϰ", "ϱ", "ϲ", "ϕ",
					-- Remove macrons, breves, and underties (no counterpart in "to").
					"[" .. macron .. breve .. undertie .. "]"
				},
				to = {
					"β", "ε", "θ", "κ", "ρ", "σ", "φ",
				}
			},
		},
		["grk-pro"] = { -- Incorrect tag
			["name"] = "Proto-Hellenic",
			["Wikipedia_name"] = "Proto-Greek",
			["article"] = "Proto-Greek language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "grk-x-proto",
		},
		["grk-x-proto"] = {
			["name"] = "Proto-Hellenic",
			["Wikipedia_name"] = "Proto-Greek",
			["article"] = "Proto-Greek language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["grt"] = {
			["name"] = "Garo",
		},
		["ha"] = {
			["name"] = "Hausa",
			-- remove tilde, grave, acute, macron, circumflex
			["replacements"] = {
				decompose = true,
				from = { "[" .. grave .. circumflex .. macron .. acute .. tilde .. "]" },
			},
		},
		["hi"] = {
			["name"] = "Hindi",
			["article"] = "Hindi",
		},
		["ilo"] = {
			["name"] = "Ilocano",
			["article"] = "Ilocano language",
		},
		["ine-bsl-pro"] = {
			["name"] = "Proto-Balto-Slavic",
			["article"] = "Proto-Balto-Slavic language",
			["type"] = "reconstructed",
		},
		["ine-pro"] = { -- Incorrect tag
			["name"] = "Proto-Indo-European",
			["article"] = "Proto-Indo-European language",
			["type"] = "reconstructed",
			["replacements"] = {},
			["Wikipedia_code"] = "ine-x-proto",
		},
		["ine-x-proto"] = {
			["name"] = "Proto-Indo-European",
			["article"] = "Proto-Indo-European language",
			["type"] = "reconstructed",
			["replacements"] = {},
		},
		["ja"] = {
			["name"] = "Japanese",
			["article"] = "Japanese language",
		},
		["jbo"] = { -- Lojban
			["type"] = "appendix",
		},
		["ket"] = {
			["name"] = "Ket",
			["article"] = "Ket language",
		},
		["ksk"] = {
			["name"] = "Kansa",
			["article"] = "Kansa language",
		},
		["la"] = {
			["name"] = "Latin",
			["article"] = "Latin",
			["replacements"] = {
				decompose = true,
				from = { "[" .. macron .. breve .. diaeresis .. "]" },
			},
		},
		["lt"] = {
			["name"] = "Lithuanian",
			-- remove acute, tilde, grave
			["replacements"] = {
				decompose = true,
				from = { "[" .. acute .. tilde .. grave .. "]" },
			},
		},
		["mkh-mvi"] = {
			["name"] = "Middle Vietnamese",
		},
		["moe"] = {
			["name"] = "Cree",
		},
		["mul"] = {
			["name"] = "Translingual",
			["article"] = "",
		},
		["nci"] = {
			["name"] = "Classical Nahuatl",
			["article"] = "Classical Nahuatl",
			-- Remove macrons, acutes, circumflexes and graves
			["replacements"] = {
				decompose = true,
				-- Remove macrons, acutes, circumflexes, graves, and saltillo;
				-- see Saltillo (linguistics).
				from = { "[" .. grave .. acute .. macron .. circumflex .. "Ꞌꞌʻʼ'ʔ]" },
			},
		},
		["nds-de"] = {
			["name"] = "German Low German",
		},
		["non"] = {
			["name"] = "Old Norse",
		},
		["non-x-proto"] = {
			["name"] = "Proto-Norse",
		},
		["odt"] = {
			["name"] = "Old Dutch",
		},
		["oge"] = {
			["name"] = "Old Georgian",
		},
		["oj"] = {
			["name"] = "Ojibwe",
		},
		["orv"] = {
			["name"] = "Old East Slavic",
			["article"] = "Old East Slavic",
			["replacements"] = {
				[U(0x484)] = "",
			},
		},
		["osx"] = {
			["name"] = "Old Saxon",
		},
		["pt"] = {
			["name"] = "Portuguese",
			["article"] = "Portuguese language",
			-- ["scripts"] = { "Latn" },
		},
		["pa"] = {
			["name"] = "Punjabi",
			["article"] = "Punjabi language",
		},
		["pgl"] = {
			["name"] = "Primitive Irish",
			["article"] = "Primitive Irish",
		},
		["pis"] = {
			["name"] = "Pijin",
			["article"] = "Pijin language",
		},
		["poz-x-poly-proto"] = {
			["name"] = "Proto-Nuclear Polynesian",
			["article"] = "Proto-Polynesian language",
			["type"] = "reconstructed",
		},
		["rap"] = {
			["name"] = "Rapa Nui",
			["article"] = "Rapa Nui language",
		},
		["ru"] = {
			["name"] = "Russian",
			["article"] = "Russian language",
			["replacements"] = { [acute] = "", },
		},
		["rw"] = {
			["name"] = "Rwanda-Rundi",
		},
		["se"] = {
			["replacements"] = {
				["([đflmnŋrsšŧv])'%1"] = "%1%1",
			},
		},
		["sem-pro"] = {
			["name"] = "Proto-Semitic",
			["article"] = "Proto-Semitic",
			["type"] = "reconstructed",
		},
		["sh"] = {
			["article"] = "Serbo-Croatian language",
			["replacements"] = {
				decompose = true,
				from = { "([AaEeIiOoUuRrАаЕеИиОоУуРр])[" .. double_grave
					.. grave .. invbreve .. acute .. macron .. tilde .. "]" },
				to = { "%1" },
			},
		},
		["sl"] = {
			["name"] = "Slovene",
			["replacements"] = {
				decompose = true,
				-- remove tonal orthography
				from = {"ł", "[" .. grave .. acute .. macron .. double_grave .. invbreve .. circumflex .. dot_below .. "]"},
				to = {"l"},
			},
		},
		["sla-pro"] = {
			["name"] = "Proto-Slavic", -- also Common Slavic
			["type"] = "reconstructed",
			["replacements"] = {
				["[ÀÁÃĀȀȂ]"] = "A",
				["[àáãāȁȃ]"] = "a",
				["[ÈÉẼĒȄȆ]"] = "E",
				["[èéẽēȅȇ]"] = "e",
				["[ÌÍĨĪȈȊ]"] = "I",
				["[ìíĩīȉȋ]"] = "i",
				["[ÒÓÕŌȌȎŐ]"] = "O",
				["[òóõōȍȏő]"] = "o",
				["[ÙÚŨŪȔȖŰ]"] = "U",
				["[ùúũūȕȗű]"] = "u",
				["[ỲÝỸȲ]"] = "Y",
				["[ỳýỹȳ]"] = "y",
				["Ǭ"] = "Ǫ",
				["ǭ"] = "ǫ",
				["[" .. grave .. acute .. double_acute .. tilde .. macron .. double_grave .. invbreve .. "]"] = "",
				["ĭ"] = "ь",
				["ŭ"] = "ъ",
			},
		},
		["tts"] = {
			["name"] = "Isan", -- also "Northeastern Thai"
			["article"] = "Isan language",
		},
		["tzo"] = {
			["name"] = "Tzotzil",
			["article"] = "Tzotzil language",
		},
		["ug"] = {
			["name"] = "Uyghur", --also less commonly "Uighur"
			["article"] = "Uyghur language",
		},
		["uk"] = {
			["article"] = "Ukrainian language",
			["replacements"] = { [acute] = "", },
		},
		["ur"] = {
			["name"] = "Urdu",
			["article"] = "Urdu",
		},
		["xcl"] = {
			["name"] = "Old Armenian",
			["article"] = "Classical Armenian",
			["replacements"] = {
				["[՞՜՛՟]"] = "",
				["և"] = "եւ",
			},
		},
		["xgf"] = {
			["name"] = "Tongva", -- not ISO name "Gabrielino-Fernandeño"
			["article"] = "Tongva language",
			["replacements"] = {
				["['`ʔ]"] = "ʼ",
			},
		},
		["xlu"] = {
			["name"] = "Luwian", -- not ISO name "Cuneiform Luwian"
			["article"] = "Cuneiform Luwian",
		},
		["xpq"] = {
			["name"] = "Mohegan-Pequot",
		},
		["xxt"] = {
			["name"] = "Tambora",
			["article"] = "Tambora language",
		},
		["xvn"] = {
			["name"] = "Vandalic",
			["article"] = "Vandalic language",
		},
		["yua"] = {
			["name"] = "Yucatec Maya",
			["article"] = "Yucatec Maya language",
		},
		["zh"] = {
			["name"] = "Chinese",
			["article"] = "Chinese language",
			-- ["scripts"] = { "Hani" },
		},
		["zle-ort"] = {
			["name"] = "Old Ruthenian",
			["article"] = "Old Ruthenian",
			["replacements"] = { [acute] = "", },
		},
	},

	-- Here, keys (for example, "gem") are Wikipedia language codes used in
	-- {{lang}}, and values (for example, "gem-pro") are the equivalent Wiktionary
	-- code.
	-- Subtags are not currently supported.
	-- Note that several targets (for example, "sq" and "fa") have no entry in
	-- the "languages" table above.
	["redirects"] = {
		["aae"] = "sq",
		["aiq"] = "fa",
		["aln"] = "sq",
		["als"] = "sq",
		["azb"] = "az",
		["azj"] = "az",
		["bgn"] = "bal",
		["bs"] = "sh",
		["bxr"] = "bua",
		["ciw"] = "oj",
		["cnr"] = "sh",
		["fil"] = "tl",
		["fuf"] = "ff",
		["gem"] = "gem-pro", -- Not correct, but is commonly used.
		["hak"] = "zh",
		["hbo"] = "he",
		["hr"] = "sh",
		["ine"] = "ine-pro", -- Not correct, but might be commonly used.
		["kjv"] = "sh",
		["nan"] = "zh",
		["prs"] = "fa",
		["rn"] = "rw",
		["sli"] = "gmw-ecg",
		["sr"] = "sh",
		["src"] = "sc",
		["sro"] = "sc",
		["tw"] = "ak",
		["wae"] = "gsw",
		["wep"] = "nds-de",
		["yue"] = "zh",
		["xno"] = "fro",
	},
}

return data