Module:Ancient Greek

local p = {}

local macron = mw.ustring.char(0x304)
local breve = mw.ustring.char(0x306)
local rough = mw.ustring.char(0x314)
local smooth = mw.ustring.char(0x313)
local diaeresis = mw.ustring.char(0x308)
local acute = mw.ustring.char(0x301)
local grave = mw.ustring.char(0x300)
local circumflex = mw.ustring.char(0x342)
local Latin_circumflex = mw.ustring.char(0x302)
local subscript = mw.ustring.char(0x345)

local macron_circumflex = macron .. diaeresis .. '?' .. Latin_circumflex

local is_velar = { ['κ'] = true, ['γ'] = true, ['χ'] = true, ['ξ'] = true }

local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
local basic_Greek = "[\206-\207][\128-\191]" -- excluding first line of Greek and Coptic block: ͰͱͲͳʹ͵Ͷͷͺͻͼͽ;Ϳ

local info = {}
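-- info maps each character (letter or combining diacritic) to a small table of
-- boolean properties; it is filled in by add_info below.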

-- The tables are shared among different characters so that they can be checked
-- for equality if needed, and to use less space.
local vowel = { vowel = true, diacritic_seat = true }
local iota = { vowel = true, diacritic_seat = true, offglide = true }
local upsilon = { vowel = true, diacritic_seat = true, offglide = true }
-- Technically rho is only a seat for rough or smooth breathing.
local rho = { consonant = true, diacritic_seat = true }
local consonant = { consonant = true }
local diacritic = { diacritic = true }
-- Needed for equality comparisons.
local breathing = { diacritic = true }
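-- For example, tokenize below relies on identity comparisons such as
-- curr_info == upsilon and prev_info == rho.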

local function add_info(characters, t)
	if type(characters) == "string" then
		for character in string.gmatch(characters, UTF8_char) do
			info[character] = t
		end
	else
		for _, character in ipairs(characters) do
			info[character] = t
		end
	end
end

add_info({
	macron, breve,
	diaeresis,
	acute, grave, circumflex,
	subscript,
}, diacritic)
add_info({ rough, smooth }, breathing)
add_info("ΑΕΗΟΩαεηοω", vowel)
add_info("Ιι", iota)
add_info("Υυ", upsilon)
add_info("ΒΓΔΖΘΚΛΜΝΞΠΡΣΤΦΧΨϜϘϺϷͶϠβγδζθκλμνξπρσςτφχψϝϙϻϸͷϡ", consonant)
add_info("Ρρ", rho)

local not_recognized = {}
setmetatable(info, {
	__index = function()
		return not_recognized
	end,
})
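-- Any character not assigned above (Latin letters, punctuation, and so on)
-- therefore yields the shared empty table, so property lookups like
-- info[character].vowel are simply nil rather than errors.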

local function quote(str)
	return "“" .. str .. "”"
end

local correspondences = {
	-- Vowels
	["α"] = "a",
	["ε"] = "e",
	["η"] = "e" .. macron,
	["ι"] = "i",
	["ο"] = "o",
	["υ"] = "u",
	["ω"] = "o" .. macron,

	-- Consonants
	["β"] = "b",
	["γ"] = "g",
	["δ"] = "d",
	["ζ"] = "z",
	["θ"] = "th",
	["κ"] = "k",
	["λ"] = "l",
	["μ"] = "m",
	["ν"] = "n",
	["ξ"] = "x",
	["π"] = "p",
	["ρ"] = "r",
	["σ"] = "s",
	["ς"] = "s",
	["τ"] = "t",
	["φ"] = "ph",
	["ψ"] = "ps",

	-- Archaic letters
	["ϝ"] = "w",
	["ϻ"] = "ś",
	["ϙ"] = "q",
	["ϡ"] = "š",
	["ͷ"] = "v",

	-- Diacritics
	[smooth] = '',
	[rough] = '', -- h is added below in the `transliterate` function.
	[breve] = '',
}

local ALA_LC = {
	["χ"] = "ch",
	[acute] = '',
	[grave] = '',
	[circumflex] = '',
	[subscript] = '',
	[diaeresis] = '',
	[macron] = '',
}

local Wiktionary_transliteration = {
	["χ"] = "kh",
	[circumflex] = Latin_circumflex,
	[subscript] = 'i',
}
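-- Characters with no entry in the active replacement tables are kept unchanged
-- by gsub, so in the Wiktionary system the acute and grave accents (and the
-- diaeresis and macron) pass through into the transliteration.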

local function add_index_metamethod(t, index_metamethod)
	local mt = getmetatable(t)
	if not mt then
		mt = {}
		setmetatable(t, mt)
	end
	mt.__index = index_metamethod
end
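-- transliterate uses this to switch the fallback table on correspondences:
-- after add_index_metamethod(correspondences, ALA_LC), correspondences["χ"]
-- resolves to "ch"; with Wiktionary_transliteration it resolves to "kh".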

--[=[
	This breaks a word into meaningful "tokens", which are individual letters
	or diphthongs with their diacritics.
	Used by Module:grc-accent and Module:grc-pronunciation.
--]=]
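-- For example, tokenize("οἴκοι") should yield three tokens (in decomposed
-- form): "οἴ", "κ", "οι".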

local function tokenize(text)
	local tokens, vowel_info, prev_info = {}, {}, {}
	local token_i = 1
	local prev
	for character in string.gmatch(mw.ustring.toNFD(text), UTF8_char) do
		local curr_info = info[character]
		-- Split vowels between tokens if not a diphthong.
		if curr_info.vowel then
			if prev and (not (curr_info.offglide and prev_info.vowel)
					-- υυ → υ, υ
					-- ιυ → ι, υ
					or prev_info.offglide and curr_info == upsilon) then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
			table.insert(vowel_info, { index = token_i })
		elseif curr_info.diacritic then
			tokens[token_i] = (tokens[token_i] or "") .. character
			if prev_info.vowel or prev_info.diacritic then
				if character == diaeresis then
					-- Current token is vowel, vowel, possibly other diacritics,
					-- and a diaeresis.
					-- Split the current token into two:
					-- the first letter, then the second letter plus any diacritics.
					local previous_vowel, vowel_with_diaeresis =
						string.match(tokens[token_i], "^(" .. basic_Greek .. ")(" .. basic_Greek .. ".+)")
					if previous_vowel then
						tokens[token_i], tokens[token_i + 1] = previous_vowel, vowel_with_diaeresis
						token_i = token_i + 1
					end
				end
			elseif prev_info == rho then
				if curr_info ~= breathing then
					error(string.format("The character %s cannot have the accent %s on it.", prev, "◌" .. character))
				end
			else
				error("The character " .. quote(prev) .. " cannot have a diacritic on it.")
			end
		elseif curr_info == rho then
			if prev and not (prev_info == breathing and info[string.match(tokens[token_i], "^" .. basic_Greek)] == rho) then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		else
			if prev then
				token_i = token_i + 1
			end
			tokens[token_i] = (tokens[token_i] or "") .. character
		end
		prev = character
		prev_info = curr_info
	end
	return tokens
end

function p.transliterate(text, system)
	add_index_metamethod(correspondences, system == "ALA-LC" and ALA_LC or Wiktionary_transliteration)
	if text == '῾' then
		return 'h'
	end
	text = mw.ustring.toNFD(text)

	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities).
	--]]
	text = mw.ustring.gsub(text, "([^A-Za-z0-9])[;" .. mw.ustring.char(0x37E) .. "]", "%1?")

	-- Handle the middle dot. It is equivalent to semicolon or colon, but
	-- semicolon is probably more common.
	text = text:gsub("·", ";")

	local tokens = tokenize(text)

	-- Now read the tokens.
	local output = {}
	for i, token in pairs(tokens) do
		-- Substitute each character in the token with its transliteration.
		local translit = string.gsub(mw.ustring.lower(token), UTF8_char, correspondences)
		if token == 'γ' and is_velar[tokens[i + 1]] then
			-- γ before a velar should be transliterated as n.
			translit = 'n'
		elseif token == 'ρ' and tokens[i - 1] == 'ρ' then
			-- ρ after ρ should be transliterated as rh.
			translit = 'rh'
		elseif system == "Wiktionary" and mw.ustring.find(token, '^[αΑ].*' .. subscript .. '$') then
			-- Add a macron to ᾳ.
			translit = mw.ustring.gsub(translit, '([aA])', '%1' .. macron)
		end
		if token:find(rough) then
			if mw.ustring.find(token, '[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		if system == "ALA-LC" and mw.ustring.find(token, '^[υΥ][^ιΙ]*$') then
			translit = translit:gsub('u', 'y'):gsub('U', 'Y')
		end
		-- Remove the macron from a vowel that has a circumflex.
		if mw.ustring.find(translit, macron_circumflex) then
			translit = translit:gsub(macron, '')
		end
		-- Capitalize the first character of the transliteration.
		if token ~= mw.ustring.lower(token) then
			translit = mw.ustring.gsub(translit, "^.", mw.ustring.upper)
		end
		table.insert(output, translit)
	end
	return table.concat(output)
end
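-- For example, p.transliterate("λόγος", "Wiktionary") should give "lógos"
-- (accents are kept), while p.transliterate("λόγος", "ALA-LC") gives "logos".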

function p.translit(frame)
	local text = frame.args[1] or frame:getParent().args[1]
	local system = frame.args.system
	if system == nil or system == "" then
		system = "Wiktionary"
	elseif not (system == "ALA-LC" or system == "Wiktionary") then
		error('Transliteration system in |system= not recognized; choose between "ALA-LC" and "Wiktionary"')
	end
	local transliteration = p.transliterate(text, system)
	return '' .. transliteration .. ''
end
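-- Typically invoked from wikitext, e.g. (assuming this page is Module:Ancient Greek):
-- {{#invoke:Ancient Greek|translit|λόγος|system=ALA-LC}}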

function p.bare_translit(frame)
	return p.transliterate(frame.args[1] or frame:getParent().args[1])
end

return p