-- Module:Ko-translit/data

-- Container for every replacement list defined in this file; it is what the file returns on its last line.
local p = {}

--[[

IMPORTANT NOTE before editing this module:

1. Make sure that you use a font that displays the following characters differently, and that you know the differences between them:

ᄀ (U+1100)

ᆨ (U+11A8)

ㄱ (U+3131)

2. When dealing with decomposed Hangul,

a. [ᄀ-ᄒ] should not be directly followed by [ᅡ-ᅵ] because MediaWiki uses Unicode Normalization Form C (NFC), which converts any sequence of [ᄀ-ᄒ][ᅡ-ᅵ] into a precomposed character; write ᄀ[ᅡ] or ᄀ(ᅡ)

b. ᄀ[ᅡ] or ᄀ(ᅡ) at the end of a pattern is equivalent to not just 가 but [가-갛] in precomposed form. To match a syllabic block without a final consonant at the end of a pattern, use both vowel + [^ᆨ-ᇂ] and vowel + $

For example, to only match 가 (and not [각-갛]) at the end of a pattern, use both ᄀ[ᅡ][^ᆨ-ᇂ] and ᄀ[ᅡ]$

--]]

--[[
Pre-processing replacements shared by RR and MR.

Every entry is a two-element table: what to look for, then what to put in its place.
The code that consumes this list is not part of this file; the entries are presumably
applied one after another in the order written here, so do not reorder them.

IMPORTANT: a replacement belongs here only if it is ALWAYS valid, in EVERY context.
    Good example: 싫증 → 실@증
    Bad example: 문자 → 문@자 (breaks words such as 방문자, pronounced [방문자], not [방문짜])
--]]
p.preprocessing = {
    -- "_" stands for an extra space that exists in the romanization only
    {"_", " "},

    -- linguistic notation: a standalone compatibility letter attached to a following syllable
    -- becomes the corresponding syllable-final consonant
    {"ㄴ([ᄀ-ᄒ])", "ᆫ%1"}, -- -ㄴ다
    {"ㄹ([ᄀ-ᄒ])", "ᆯ%1"}, -- -ㄹ까, -ㄹ래
    {"ㄹ@([ᄀᄃᄇᄉᄌ])", "ᆯ@%1"}, -- -ㄹ지
    {"ㅁ([ᄀ-ᄒ])", "ᆷ%1"},
    {"ㅂ([ᄀ-ᄒ])", "ᆸ%1"}, -- -ㅂ니다, -ㅂ시다

    -- 윷 and 잎 always trigger ㄴ-addition after a final consonant
    {"([ᆨ-ᇂ])ᄋ(ᅲᆾ)", "%1ᄂ%2"},
    {"([ᆨ-ᇂ])ᄋ(ᅵᇁ)", "%1ᄂ%2"},

    -- 곧이어 [고디어] (two entries: "no final consonant follows" and "end of text")
    {"(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ][^ᆨ-ᇂ])", "%1ᄃ%2"},
    {"(ᄀ[ᅩ])ᆮᄋ(ᅵᄋ[ᅥ])$", "%1ᄃ%2"},

    -- 싫증 [실쯩]
    {"(ᄉ[ᅵ])ᆶ(ᄌ[ᅳ]ᆼ)", "%1ᆯ@%2"},

    -- 여덟 + particle: no tensification
    {"(ᄋ[ᅧ]ᄃ[ᅥ])ᆲ([ᄀᄃᄇᄉᄌ])", "%1ᆯ%2"},

    -- ㄺ + ㄱ pronounced [ㄱㄲ]
    -- (very rare words such as 삼시욹, 안찱, 우줅거리다 are deliberately left out)
    {"([ᄃᄉᄐ]ᅡ)ᆰᄀ", "%1ᆨᄀ"}, -- 닭, 삵, 수탉/암탉
    {"([ᄉᄒ]ᅳ)ᆰᄀ", "%1ᆨᄀ"}, -- 기슭, 흙
    {"(ᄎ[ᅵ])ᆰᄀ", "%1ᆨᄀ"}, -- 칡
    -- every other ㄺ + ㄱ is [ㄹㄲ]: usually a verb/adjective stem in ㄺ plus an ending or
    -- suffix starting with ㄱ (맑고 [말꼬], 긁개 [글깨])
    {"ᆰᄀ", "ᆯ@ᄀ"},

    -- palatalization, and ㅈ + -히-
    {"ᆮᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄌ%1"}, -- 해돋이 [해도지]
    {"ᆮᄋ(ᅵ[^ᆨ-ᇂ])", "ᄌ%1"},
    {"ᆮᄋ(ᅵ)$", "ᄌ%1"},
    {"[ᆮᆽ]ᄒ(ᅧᆻ)", "ᄎ%1"}, -- 굳히다 [구치다], 꽂히다 [꼬치다]
    {"[ᆮᆽ]ᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1"},
    {"[ᆮᆽ]ᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1"},
    {"[ᆮᆽ]ᄒ([ᅧᅵ])$", "ᄎ%1"},
    {"ᆴᄋ(ᅧᆻ)", "ᆯᄎ%1"}, -- 훑이다 [훌치다]
    {"ᆴᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄎ%1"},
    {"ᆴᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄎ%1"},
    {"ᆴᄋ([ᅧᅵ])$", "ᆯᄎ%1"},
    {"ᇀᄋ(ᅧᆻ)", "ᄎ%1"}, -- 붙이다 [부치다]
    {"ᇀᄋ(ᅵ[ᆫᆯᆷᆸ])", "ᄎ%1"},
    {"ᇀᄋ([ᅧᅵ][^ᆨ-ᇂ])", "ᄎ%1"},
    {"ᇀᄋ([ᅧᅵ])$", "ᄎ%1"},

    -- {ㄵ, ㄺ, ㄼ} + -히-
    {"ᆬᄒ(ᅧᆻ)", "ᆫᄎ%1"}, -- 앉히다 [안치다]
    {"ᆬᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆫᄎ%1"},
    {"ᆬᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆫᄎ%1"},
    {"ᆬᄒ([ᅧᅵ])$", "ᆫᄎ%1"},
    {"ᆰᄒ(ᅧᆻ)", "ᆯᄏ%1"}, -- 밝히다 [발키다]
    {"ᆰᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄏ%1"},
    {"ᆰᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄏ%1"},
    {"ᆰᄒ([ᅧᅵ])$", "ᆯᄏ%1"},
    {"ᆲᄒ(ᅧᆻ)", "ᆯᄑ%1"}, -- 넓히다 [널피다], 밟히다 [발피다]
    {"ᆲᄒ(ᅵ[ᆫᆯᆷᆸ])", "ᆯᄑ%1"},
    {"ᆲᄒ([ᅧᅵ][^ᆨ-ᇂ])", "ᆯᄑ%1"},
    {"ᆲᄒ([ᅧᅵ])$", "ᆯᄑ%1"},

    -- 넓- pronounced [넙] before a consonant
    {"(ᄂ[ᅥ])ᆲ([ᄁᄄ-ᄈᄊᄍ-ᄒ])", "%1ᆸ%2"},
    {"(ᄂ[ᅥ])ᆲ(ᄃ[ᅡ]ᄃ[ᅳ]ᆷ)", "%1ᆸ%2"}, -- 넓다듬이
    {"(ᄂ[ᅥ])ᆲ(ᄃ[ᅮ]ᆼ)", "%1ᆸ%2"}, -- 넓둥글다
    {"(ᄂ[ᅥ])ᆲ(ᄉ[ᅡ]ᆯᄆ[ᅮ]ᆫ)", "%1ᆸ%2"}, -- 넓살문
    {"(ᄂ[ᅥ])ᆲ(ᄌ[ᅥᅮ]ᆨ)", "%1ᆸ%2"}, -- 넓적-, 넓죽-

    -- 밟- is [밥] before a consonant (the null initial ㅇ does not count)
    {"(ᄇ[ᅡ])ᆲ([^ᄋ])", "%1ᆸ%2"},
    {"(ᄇ[ᅡ])ᆲ$", "%1ᆸ"},

    -- ㄵ, ㄼ, ㄾ tensify the consonant that follows
    -- ㄻ must NOT be added: it does not always tensify (굶기다 [굼기다], 삶조차 [삼조차])
    {"([ᆬᆲᆴ])([ᄀᄃᄌ])", "%1@%2"},

    -- automatic 절음 법칙 (the entries below insert "$" as its marker)
    -- exception: 없애다 [업쌔다]
    {"(ᄋ[ᅥ])ᆹᄋ(ᅢ[ᆫᆯᆷᆸᆻ])", "%1ᆸᄉ%2"},
    {"(ᄋ[ᅥ])ᆹᄋ(ᅢ[^ᆨ-ᇂ])", "%1ᆸᄉ%2"},
    {"(ᄋ[ᅥ])ᆹᄋ(ᅢ)$", "%1ᆸᄉ%2"},
    -- exception: 맛있다 and 멋있다, usually pronounced [마싣따] and [머싣따] respectively
    {"(ᄆ[ᅡᅥ])ᆺᄋ(ᅵᆻ)", "%1ᄉ%2"},
    -- not applied to 아, 았, 어, 었, 여, 였
    {"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅡᅥᅧ][ᆨ-ᆺᆼ-ᇂ])", "%1$%2"},
    -- not applied to 에, 엔, 엘
    {"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅦ][ᆨ-ᆪᆬ-ᆮᆰ-ᇂ])", "%1$%2"},
    -- not applied to 요, 의 (without a final consonant)
    {"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅭᅴ][ᆨ-ᇂ])", "%1$%2"},
    -- not applied to 으, 은, 을, 음, 읍, 이, 인, 일, 임, 입
    {"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅳᅵ][ᆨ-ᆪᆬ-ᆮᆰ-ᆶᆹ-ᇂ])", "%1$%2"},
    {"([ᆩᆪᆬᆰ-ᆵᆹ-ᆻᆽ-ᇂ])(ᄋ[ᅢ-ᅤᅨ-ᅬᅮ-ᅲ])", "%1$%2"},

    -- "@" requesting ㄴ-addition: 색연필 [생년필], 물엿 [물렫]
    {"([ᆨ-ᇂ])@ᄋ([ᅣᅤᅧᅨᅭᅲᅵ])", "%1ᄂ%2"},

    -- 연음: a final consonant carried over onto a following null initial ㅇ
    {"ᆨᄋ", "ᄀ"},
    {"ᆩᄋ", "ᄁ"},
    {"ᆪᄋ", "ᆨᄉ"},
    {"ᆬᄋ", "ᆫᄌ"},
    {"ᆮᄋ", "ᄃ"},
    {"[ᆯᆶ]ᄋ", "ᄅ"},
    {"ᆰᄋ", "ᆯᄀ"},
    {"ᆱᄋ", "ᆯᄆ"},
    {"ᆲᄋ", "ᆯᄇ"},
    {"ᆳᄋ", "ᆯᄉ"},
    {"ᆴᄋ", "ᆯᄐ"},
    {"ᆵᄋ", "ᆯᄑ"},
    {"ᆸᄋ", "ᄇ"},
    {"ᆹᄋ", "ᆸᄉ"},
    {"ᆺᄋ", "ᄉ"},
    {"ᆻᄋ", "ᄊ"},
    {"ᆽᄋ", "ᄌ"},
    {"ᆾᄋ", "ᄎ"},
    {"ᆿᄋ", "ᄏ"},
    {"ᇀᄋ", "ᄐ"},
    {"ᇁᄋ", "ᄑ"},
    {"ᇂᄋ", "ᄋ"}, -- ㅎ is silent here; 좋아 [조아]

    -- ㅎ combinations (aspiration)
    -- trivia: {ㄶ, ㅀ, ㅎ} + ㅂ does not actually occur; listed for completeness only
    -- (a syllable-final ㅎ is there for aspiration anyway)
    {"ᆭᄀ", "ᆫᄏ"},
    {"ᆭᄃ", "ᆫᄐ"},
    {"ᆭᄇ", "ᆫᄑ"},
    {"ᆭᄌ", "ᆫᄎ"},
    {"ᆶᄀ", "ᆯᄏ"},
    {"ᆶᄃ", "ᆯᄐ"},
    {"ᆶᄇ", "ᆯᄑ"},
    {"ᆶᄌ", "ᆯᄎ"},
    {"ᇂᄀ", "ᄏ"},
    {"ᇂᄃ", "ᄐ"},
    {"ᇂᄇ", "ᄑ"},
    {"ᇂᄌ", "ᄎ"},
}

-- MR only: these replacements have to run before syllable-final consonants are neutralized.
p.before_neutralizing_syl_final_consonants_mr = {
    -- further ㅎ combinations, this time in front of ㅅ
    {"[ᆬᆭ]ᄉ", "ᆫᄊ"},
    {"[ᆲᆴᆶ]ᄉ", "ᆯᄊ"},
    {"ᇂᄉ", "ᄊ"},

    -- "@" after a written 사이시옷, followed by ㄱ/ㅂ
    {"ᆺ@ᄀ", "ᄁ"},
    {"ᆺ@ᄇ", "ᄈ"},
}

-- Neutralization: every syllable-final consonant listed on the left collapses into the
-- single final consonant on the right.
p.neutralize_syl_final_consonants = {
    {"[ᆩᆪᆰᆿ]", "ᆨ"},
    {"[ᆬᆭ]", "ᆫ"},
    {"[ᆺᆻᆽᆾᇀᇂ]", "ᆮ"},
    {"[ᆲᆳᆴᆶ]", "ᆯ"},
    {"ᆱ", "ᆷ"},
    {"[ᆵᆹᇁ]", "ᆸ"},
}

-- Marker-driven irregularities:
--   "@" between ㄴ and ㄹ means the pair is pronounced [ㄴㄴ]
--   "$" is the 절음 법칙 marker
-- The other documented irregularities need no entry here; they are handled automatically.
p.at_dollar_irregularities = {
    -- 음운론 [으문논]
    {"ᆫ@ᄅ", "ᆫᄂ"},

    -- final consonant + "$" + null initial ㅇ ("%$" matches a literal dollar sign)
    {"ᆨ%$ᄋ", "ᄀ"},
    {"ᆮ%$ᄋ", "ᄃ"}, -- 웃어른 [우더른]
    {"ᆯ%$ᄋ", "ᄅ"},
    {"ᆸ%$ᄋ", "ᄇ"},

    -- any "$" still present is dropped
    {"%$", ""},
}

-- RR only: "@" between ㄱ/ㄷ/ㅂ and ㅎ makes the pair come out as k/t/p.
p.at_irregularities_additional_rr = {
    {"ᆨ@ᄒ", "ᄏ"},
    {"ᆮ@ᄒ", "ᄐ"},
    {"ᆸ@ᄒ", "ᄑ"},

    -- any "@" still present is dropped
    {"@", ""},
}

-- MR only: positions in which ㄱ, ㄷ, ㅂ, ㅈ turn into voiced consonants.
-- A backtick is written in front of the consonant that is to be voiced.
p.gdbj_mr = {
    -- n'g
    {"ᆫᄀ", "ᆫ'`ᄀ"},

    -- after a vowel or after ㄴ/ㄹ/ㅁ/ㅇ
    {"([ᅡ-ᅵᆫᆯᆷᆼ])([ᄀᄃᄇᄌ])", "%1`%2"},

    -- "*" asks for an extra hyphen that exists in the romanization only;
    -- voicing is kept after that hyphen
    {"([ᅡ-ᅵᆫᆯᆷᆼ])%*([ᄀᄃᄇᄌ])", "%1-`%2"},
}

-- Consonant assimilation across a syllable boundary:
-- (final consonant)(initial consonant) → assimilated pair.
p.consonant_assimilations = {
    {"[ᆨᆼ][ᄂᄅ]", "ᆼᄂ"},
    {"ᆨᄆ", "ᆼᄆ"},
    {"ᆫᄅ", "ᆯᄅ"},
    {"ᆮ[ᄂᄅ]", "ᆫᄂ"},
    {"ᆮᄆ", "ᆫᄆ"},
    {"ᆯᄂ", "ᆯᄅ"},
    {"[ᆷᆸ][ᄂᄅ]", "ᆷᄂ"},
    {"ᆸᄆ", "ᆷᄆ"},
}

-- Extra consonant assimilations used for MR only.
p.consonant_assimilations_additional_mr = {
    -- avoid {kkk, ttt, ppp, sss/ts/tss, ttch}
    {"ᆨᄁ", "ᄁ"},
    {"ᆮᄄ", "ᄄ"},
    {"ᆸᄈ", "ᄈ"},
    {"ᆮ[ᄉᄊ]", "ᄊ"},
    {"ᆮᄍ", "ᄍ"},

    -- miscellaneous conversions; these already emit Latin letters (l, r, sh)
    {"ᆯᄅ", "ᆯl"},
    {"ᆯᄒ", "rᄒ"},
    {"ᄉ[ᅱ]", "shᅱ"},
}

-- After ㅈ, ㅉ, ㅊ the "y" is dropped: each y-vowel is swapped for its plain counterpart.
p.drop_y = {
    {"([ᄌ-ᄎ])ᅣ", "%1ᅡ"},
    {"([ᄌ-ᄎ])ᅤ", "%1ᅢ"},
    {"([ᄌ-ᄎ])ᅧ", "%1ᅥ"},
    {"([ᄌ-ᄎ])ᅨ", "%1ᅦ"},
    {"([ᄌ-ᄎ])ᅭ", "%1ᅩ"},
    {"([ᄌ-ᄎ])ᅲ", "%1ᅮ"},
}

-- RR: vowel → romanized text.
-- Each character class holds two code points for the same vowel letter, so either form is accepted.
p.vowels_rr = {
    {"[ᅡㅏ]", "a"},
    {"[ᅢㅐ]", "ae"},
    {"[ᅣㅑ]", "ya"},
    {"[ᅤㅒ]", "yae"},
    {"[ᅥㅓ]", "eo"},
    {"[ᅦㅔ]", "e"},
    {"[ᅧㅕ]", "yeo"},
    {"[ᅨㅖ]", "ye"},
    {"[ᅩㅗ]", "o"},
    {"[ᅪㅘ]", "wa"},
    {"[ᅫㅙ]", "wae"},
    {"[ᅬㅚ]", "oe"},
    {"[ᅭㅛ]", "yo"},
    {"[ᅮㅜ]", "u"},
    {"[ᅯㅝ]", "wo"},
    {"[ᅰㅞ]", "we"},
    {"[ᅱㅟ]", "wi"},
    {"[ᅲㅠ]", "yu"},
    {"[ᅳㅡ]", "eu"},
    {"[ᅴㅢ]", "ui"},
    {"[ᅵㅣ]", "i"},
}

-- MR: vowel → romanized text.
-- Each character class holds two code points for the same vowel letter, so either form is accepted.
p.vowels_mr = {
    {"[ᅡㅏ]", "a"},
    {"[ᅢㅐ]", "ae"},
    {"[ᅣㅑ]", "ya"},
    {"[ᅤㅒ]", "yae"},
    {"[ᅥㅓ]", "ŏ"},
    {"[ᅦㅔ]", "e"},
    {"[ᅧㅕ]", "yŏ"},
    {"[ᅨㅖ]", "ye"},
    {"[ᅩㅗ]", "o"},
    {"[ᅪㅘ]", "wa"},
    {"[ᅫㅙ]", "wae"},
    {"[ᅬㅚ]", "oe"},
    {"[ᅭㅛ]", "yo"},
    {"[ᅮㅜ]", "u"},
    {"[ᅯㅝ]", "wŏ"},
    {"[ᅰㅞ]", "we"},
    {"[ᅱㅟ]", "wi"},
    {"[ᅲㅠ]", "yu"},
    {"[ᅳㅡ]", "ŭ"},
    {"[ᅴㅢ]", "ŭi"},
    {"[ᅵㅣ]", "i"},
}

-- RR: single consonant → romanized text.
-- A class may mix syllable-initial, syllable-final and standalone (compatibility) forms of a
-- letter; the forms are easy to confuse visually, so edit with care.
p.single_consonants_rr = {
    {"[ᄀㄱ]", "g"},
    {"[ᄁㄲ]", "kk"},
    {"ㄳ", "ks"},
    {"[ᄂᆫㄴ]", "n"},
    {"ㄵ", "nj"},
    {"ㄶ", "nh"},
    {"[ᄃㄷ]", "d"},
    {"[ᄄㄸ]", "tt"},
    {"[ᄅㄹ]", "r"},
    {"ᆯ", "l"},
    {"ㄺ", "lg"},
    {"ㄻ", "lm"},
    {"ㄼ", "lb"},
    {"ㄽ", "ls"},
    {"ㄾ", "lt"},
    {"ㄿ", "lp"},
    {"ㅀ", "lh"},
    {"[ᄆᆷㅁ]", "m"},
    {"[ᄇㅂ]", "b"},
    {"[ᄈㅃ]", "pp"},
    {"ㅄ", "ps"},
    {"[ᄉㅅ]", "s"},
    {"[ᄊㅆ]", "ss"},
    {"[ᄋㅇ]", ""}, -- produces no output
    {"ᆼ", "ng"},
    {"[ᄌㅈ]", "j"},
    {"[ᄍㅉ]", "jj"},
    {"[ᄎㅊ]", "ch"},
    {"[ᄏᆨㅋ]", "k"},
    {"[ᄐᆮㅌ]", "t"},
    {"[ᄑᆸㅍ]", "p"},
    {"[ᄒㅎ]", "h"},
}

-- MR: single consonant → romanized text.
-- A class may mix syllable-initial, syllable-final and standalone (compatibility) forms of a
-- letter; the forms are easy to confuse visually, so edit with care.
p.single_consonants_mr = {
    -- a backtick in front of ㄱ/ㄷ/ㅂ/ㅈ selects the voiced letter
    {"`ᄀ", "g"},
    {"`ᄃ", "d"},
    {"`ᄇ", "b"},
    {"`ᄌ", "j"},
    -- any backtick still present is dropped
    {"`", ""},

    {"[ᄀᆨㄱ]", "k"},
    {"[ᄁㄲ]", "kk"},
    {"ㄳ", "ks"},
    {"[ᄂᆫㄴ]", "n"},
    {"ㄵ", "nj"},
    {"ㄶ", "nh"},
    {"[ᄃᆮㄷ]", "t"},
    {"[ᄄㄸ]", "tt"},
    {"[ᄅㄹ]", "r"},
    {"ᆯ", "l"},
    {"ㄺ", "lg"},
    {"ㄻ", "lm"},
    {"ㄼ", "lb"},
    {"ㄽ", "ls"},
    {"ㄾ", "lt'"},
    {"ㄿ", "lp'"},
    {"ㅀ", "rh"},
    {"[ᄆᆷㅁ]", "m"},
    {"[ᄇᆸㅂ]", "p"},
    {"[ᄈㅃ]", "pp"},
    {"ㅄ", "ps"},
    {"[ᄉㅅ]", "s"},
    {"[ᄊㅆ]", "ss"},
    {"[ᄋㅇ]", ""}, -- produces no output
    {"ᆼ", "ng"},
    {"[ᄌㅈ]", "ch"},
    {"[ᄍㅉ]", "tch"},
    {"[ᄎㅊ]", "ch'"},
    {"[ᄏㅋ]", "k'"},
    {"[ᄐㅌ]", "t'"},
    {"[ᄑㅍ]", "p'"},
    {"[ᄒㅎ]", "h"},
}

-- Unwrapping of enclosed Hangul characters into ordinary text inside parentheses.
--   * not strictly needed, but Unicode also classifies these as Hangul characters
--   * parenthesized and circled variants are treated alike
--   * has to run before Hangul is decomposed
p.enclosed_hangul = {
    -- enclosed consonant letters → letter name
    {"[㈀㉠]", "(기역)"},
    {"[㈁㉡]", "(니은)"},
    {"[㈂㉢]", "(디귿)"},
    {"[㈃㉣]", "(리을)"},
    {"[㈄㉤]", "(미음)"},
    {"[㈅㉥]", "(비읍)"},
    {"[㈆㉦]", "(시옷)"},
    {"[㈇㉧]", "(이응)"},
    {"[㈈㉨]", "(지읒)"},
    {"[㈉㉩]", "(치읓)"},
    {"[㈊㉪]", "(키읔)"},
    {"[㈋㉫]", "(티읕)"},
    {"[㈌㉬]", "(피읖)"},
    {"[㈍㉭]", "(히읗)"},

    -- enclosed syllables
    {"[㈎㉮]", "(가)"},
    {"[㈏㉯]", "(나)"},
    {"[㈐㉰]", "(다)"},
    {"[㈑㉱]", "(라)"},
    {"[㈒㉲]", "(마)"},
    {"[㈓㉳]", "(바)"},
    {"[㈔㉴]", "(사)"},
    {"[㈕㉵]", "(아)"},
    {"[㈖㉶]", "(자)"},
    {"[㈗㉷]", "(차)"},
    {"[㈘㉸]", "(카)"},
    {"[㈙㉹]", "(타)"},
    {"[㈚㉺]", "(파)"},
    {"[㈛㉻]", "(하)"},

    -- enclosed words and abbreviations
    {"㈜", "(주)"},
    {"㈝", "(오전)"},
    {"㈞", "(오후)"},
    {"㉼", "(참고)"},
    {"㉽", "(주의)"},
    {"㉾", "(우)"},
}

-- Protecting backslash-escaped special characters.
--
-- Several of these characters ($, *, @, _ and `) are used as markers by the replacement
-- lists in this file, so an escaped occurrence must be hidden from those rules. Each entry
-- rewrites "backslash + character" to the character's decimal HTML character reference.
-- The right-hand side has to be the reference itself: putting the bare character there
-- would merely strip the backslash and expose the character to the marker rules again
-- (and a lone "%" is not a valid replacement string).
-- Left-hand sides are Lua patterns: "\\" is one literal backslash, and "%" escapes the
-- pattern-magic characters $, %, * and ^.
p.escaped_to_html_enc = {
    {"\\%$", "&#36;"},
    {"\\%%", "&#37;"},
    {"\\%*", "&#42;"},
    {"\\@", "&#64;"},
    {"\\%^", "&#94;"},
    {"\\_", "&#95;"},
    {"\\`", "&#96;"}
}

-- Reverse step: turn the character references written by p.escaped_to_html_enc back into
-- the plain characters.
--
-- The left-hand side must be the reference text. A bare "$" or "^" would act as a pattern
-- anchor, a bare "%" is a malformed pattern, and a bare "*", "@", "_" or "`" would only
-- map the character to itself. None of &, # and ; is pattern-magic, so the references
-- need no escaping. On the right, "%%" is how a replacement string spells one literal "%".
p.html_enc_to_ascii = {
    {"&#36;", "$"},
    {"&#37;", "%%"},
    {"&#42;", "*"},
    {"&#64;", "@"},
    {"&#94;", "^"},
    {"&#95;", "_"},
    {"&#96;", "`"}
}

-- Hand the populated data table back to whatever loads this file.
return p