-- Module:Lang/data/iana languages/make

require('strict');

--[=[------------------------< G E T _ V A R I A N T _ P A R T S >---------------------------------------------

Extract the subtag code, the prefixes, and the descriptions from one IANA registry record of type 'variant'.

We get a record that looks more-or-less like this:
	%%\n
	Type: variant\n
	Subtag: bohoric\n
	Description: Slovene in Bohorič alphabet\n
	Added: 2012-06-27\n
	Prefix: sl\n

Each line is terminated with a \n character; text after the last \n is not read.

Type, for this function can only be 'variant'
Subtag is the code of Type
Prefix is a language code to which this variant applies; one language code per Prefix line.  There can be
	more than one Prefix line.  Prefixes are lowercased.
Description associates Subtag with a proper name or names; one name per Description line.  There can be more
	than one Description line and Description lines can wrap to the next line.  When they do, the continuation
	line begins with whitespace.

A continuation line is joined (with a single space) onto the preceding description only when the field that
it continues is a Description field.  Continuations of any other field (Comments: in particular) are ignored,
as are continuation lines that appear before any Description line.  Continuation lines are recognised by their
leading whitespace alone, so a continuation that happens to contain a colon is still treated as a continuation.

Records that are deprecated, that have a Preferred-Value, or that mention 'Private use' are ignored.

parameters:
	record - text of one registry record with the %% delimiters removed

returns:
	'skip' (one value only) when the record is to be ignored; otherwise three values:
	code - the subtag (nil when the record has no usable Subtag line)
	prefixes - comma-separated list of quoted, lowercased prefixes (empty string when there are none)
	descriptions - comma-separated list of quoted descriptions (empty string when there are none); backslashes
		and quote marks inside a description are backslash-escaped so that the result is a valid Lua string

]=]

local function get_variant_parts (record)
	if string.find (record, 'Deprecated', 1, true) or string.find (record, 'Preferred-Value', 1, true)
		or string.find (record, 'Private use', 1, true) then
			return 'skip';
	end

	local code;
	local descriptions = {};													-- raw description text; quoted just before return
	local prefixes = {};
	local in_description = false;												-- true while the most recent field is a Description

	for line in string.gmatch (record, '([^\n]+)\n') do						-- get a \n terminated line of text (without the \n)
		if string.find (line, '^%s') then										-- leading whitespace marks a continuation line
			local text = string.match (line, '^%s+(%S.*)');						-- drop all of the leading whitespace
			if text and in_description then										-- only description continuations are kept
				descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. text;
			end
		else
			local label = string.match (line, '^(.-):');
			in_description = false;												-- a new field ends any description continuation

			if 'Subtag' == label then											-- if this line is the subtag line
				code = string.match (line, 'Subtag: (%w+)');					-- extract and save the subtag code
			elseif 'Description' == label then									-- if this line is a description line
				local desc = string.match (line, 'Description: (.+)');			-- extract the description
				if desc then													-- nil when the line has no value
					table.insert (descriptions, desc);
					in_description = true;
				end
			elseif 'Prefix' == label then										-- if this line is a prefix line
				local prefix = string.match (line, 'Prefix: (.+)');
				if prefix then													-- nil when the line has no value
					table.insert (prefixes, '"' .. prefix:lower() .. '"');		-- save the prefix wrapped in quote marks
				end
			end
		end
	end

	for i, desc in ipairs (descriptions) do										-- escape (see 1959acad) and wrap in quote marks
		descriptions[i] = '"' .. desc:gsub ('\\', '\\\\'):gsub ('"', '\\"') .. '"';
	end

	return code, table.concat (prefixes, ', '), table.concat (descriptions, ', ');
end

--[=[------------------------< G E T _ L A N G _ S C R I P T _ R E G I O N _ P A R T S >-----------------------

Extract the subtag code, the descriptions, the suppress script, and the deprecated flag from one IANA registry
record of type 'language', 'script', or 'region'.

We get a record that looks more-or-less like this:
	%%\n
	Type: language\n
	Subtag: aa\n
	Description: Afar\n
	Added: 2005-10-16\n

Each line is terminated with a \n character; text after the last \n is not read.

Type, for our purposes can be 'language', 'script', or 'region'
Subtag is the code of Type
Description associates Subtag with a proper name or names; one name per Description line.  There can be more
	than one Description line and Description lines can wrap to the next line.  When they do, the continuation
	line begins with whitespace.

A continuation line is joined (with a single space) onto the preceding description only when the field that
it continues is a Description field.  Continuations of any other field (Comments: in particular) are ignored,
as are continuation lines that appear before any Description line.

Records that mention 'Private use' are ignored.

parameters:
	record - text of one registry record with the %% delimiters removed

returns:
	'skip' (one value only) when the record is to be ignored; otherwise four values:
	code - the subtag (nil when the record has no usable Subtag line)
	descriptions - comma-separated list of quoted descriptions (empty string when there are none); backslashes
		and quote marks inside a description are backslash-escaped so that the result is a valid Lua string
	suppress - value of the Suppress-Script field; nil when the record has none
	deprecated - true when the record has a Deprecated field; nil otherwise

]=]

local function get_lang_script_region_parts (record)
	if record:find ('Private use', 1, true) then
		return 'skip';
	end

	local code;
	local suppress;																-- Suppress script for this code if specified
	local deprecated;															-- boolean; true when subtag is deprecated
	local descriptions = {};													-- raw description text; quoted just before return
	local in_description = false;												-- true while the most recent field is a Description

	for line in record:gmatch ('([^\n]+)\n') do									-- get a \n terminated line of text (without the \n)
		if line:find ('^%s') then												-- leading whitespace marks a continuation line
			local text = line:match ('^%s+(%S.*)');								-- drop all of the leading whitespace
			if text and in_description then										-- only description continuations are kept
				descriptions[#descriptions] = descriptions[#descriptions] .. ' ' .. text;
			end
		else
			local label = line:match ('^(.-):');
			in_description = false;												-- a new field ends any description continuation

			if 'Subtag' == label then											-- if this line is the subtag line
				code = line:match ('Subtag: (%w+)');							-- extract and save the subtag code
			elseif 'Description' == label then									-- if this line is a description line
				local desc = line:match ('Description: (.+)');					-- extract the name
				if desc then													-- nil when the line has no value
					table.insert (descriptions, desc);
					in_description = true;
				end
			elseif 'Deprecated' == label then
				deprecated = true;												-- subtag is deprecated; set our flag
			elseif 'Suppress-Script' == label then
				suppress = line:match ('Suppress%-Script: (%S+)');
			end
		end
	end

	for i, desc in ipairs (descriptions) do										-- escape and wrap in quote marks, as get_variant_parts does
		descriptions[i] = '"' .. desc:gsub ('\\', '\\\\'):gsub ('"', '\\"') .. '"';
	end

	return code, table.concat (descriptions, ', '), suppress, deprecated;
end

--[=[------------------------< I A N A _ E X T R A C T >-------------------------------------------------------

read a local copy of the IANA language-subtag-registry file and from it build tables to replace the tables in:

Module:Lang/data/iana languages

Module:Lang/data/iana regions

Module:Lang/data/iana scripts

Module:Lang/data/iana suppressed scripts

Module:Lang/data/iana variants

current language-subtag-registry file can be found at: http://www.iana.org/assignments/language-subtag-registry

archive.org has copies of previous versions see: https://web.archive.org/web/*/http://www.iana.org/assignments/language-subtag-registry

The registry text is taken from the unparsed content of the page returned by mw.title.getCurrentTitle().

That text is split into %% delimited records; each record is handed to get_lang_script_region_parts() or to

get_variant_parts() according to its Type field.  Records of any other type are ignored.

For each record type a list of table-entry strings is collected:

language: deprecated subtags go to the deprecated list; all others go to the active list and, when the

code is two characters long, also to the ISO 639-1 list

script, region, variant: one list each

suppressed scripts: language codes grouped by the script named in their Suppress-Script field

Records for which the helper returns skip contribute nothing; when the helper returns no code at all, an

error entry that holds the raw record text is added to the list for that record type.

The suppressed-script entries are sorted; the other lists keep the order in which records were read.

parameters:

frame - not referenced anywhere in the function body

returns:

a single string: for each of languages, scripts, regions, variants, suppressed scripts, and ISO 639-1, a

header line, the File-Date line from the registry, and the generated table source

NOTE(review): when the content has no File-Date line, file_date stays nil and the final concatenation

would presumably raise an error -- confirm whether callers rely on that

]=]

local function iana_extract (frame)

local page = mw.title.getCurrentTitle(); -- get a page object for this page

local content = page:getContent(); -- get unparsed content

local lang_table = {}; -- languages go here

local lang_dep_table = {}; -- deprecated languages go here

local script_table = {}; -- scripts go here

local region_table = {}; -- regions go here

local variant_table = {}; -- variants go here

local suppress_table = {}; -- here we collect suppressed scripts and associated language codes

local iso_639_1_table = {}; -- ISO 639-1 languages; not used by Module:Lang but included here to ensure Module:Lang/data/ISO_639-1 gets updated

local file_date; -- first line

local code;

local descriptions;

local prefixes; -- used for language variants only

local suppress; -- a code's suppress script

local deprecated; -- boolean: true when subtag is deprecated

file_date = content:match ('(File%-Date: %d%d%d%d%-%d%d%-%d%d)'); -- get the file date line from this version of the source file

-- the capture below stops at the first percent sign, so a record that itself contained one would be cut short there

for record in string.gmatch (content, '%%%%([^%%]+)') do -- get a %% delimited 'record' from the file; leave off the delimiters

local record_type = string.match(record, 'Type: (%w+)')

if record_type == 'language' then -- if a language record

code, descriptions, suppress, deprecated = get_lang_script_region_parts (record); -- get the code, description(s), suppress script, and deprecated flag

if code and ('skip' ~= code) then

if deprecated then

table.insert (lang_dep_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries

else

table.insert (lang_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries

if 2 == code:len() then

table.insert (iso_639_1_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries

end

end

elseif not code then

table.insert (lang_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but inserting an error entry in the final output can be helpful

end

-- here we collect suppress script tags and their associated language codes;

-- prettifying the data in this table must wait until all language codes have been read

-- this test sits outside the deprecated test above, so deprecated language codes are collected here too;

-- for a skipped record the helper returns a single value, which leaves suppress nil

if suppress then -- if this code has a suppressed script

local suppressed_code = table.concat ({'\"', code, '\"'}); -- wrap the code in quotes

if suppress_table[suppress] then -- if there is an entry for this script

table.insert (suppress_table[suppress], suppressed_code); -- insert the new code

else

suppress_table[suppress] = {}; -- add new script and empty table

table.insert (suppress_table[suppress], suppressed_code); -- insert the new code

end

end

elseif record_type == 'script' then -- if a script record

code, descriptions = get_lang_script_region_parts (record); -- get the code and description(s)

if code and ('skip' ~= code) then

table.insert (script_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries

elseif not code then

table.insert (script_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ...

end

elseif record_type == 'region' then -- if a region record

code, descriptions = get_lang_script_region_parts (record); -- get the code and description(s)

if code and ('skip' ~= code) then

table.insert (region_table, "[\"" .. code .. "\"] = {" .. descriptions .. "}"); -- make table entries

elseif not code then

table.insert (region_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ...

end

elseif record_type == 'variant' then -- if a variant record

code, prefixes, descriptions = get_variant_parts (record); -- get the code, prefix(es), and description(s)

if code and ('skip' ~= code) then

-- a variant entry is a nested table that holds a descriptions list and a prefixes list

table.insert (variant_table,

table.concat ({

"[\"",

code,

"\"] = {
[\"descriptions\"] = {",

descriptions,

"},
[\"prefixes\"] = {",

prefixes,

"},
}"

})

);

elseif not code then

table.insert (variant_table, "[\"error\"] = {" .. record .. "}"); -- code should never be nil, but ...

end

end

end

-- now prettify the suppressed script table: one entry per script, language codes wrapped LIMIT to a line

local pretty_suppressed = {};

for script, code_tbl in pairs (suppress_table) do

local LIMIT = 11; -- max number of subtags on a line before a line break

local fragment_tbl = {}; -- groups of LIMIT number of subtags collected here

for i=1, #code_tbl, LIMIT do

local stop = ((i+LIMIT-1) > #code_tbl) and #code_tbl or i+LIMIT-1; -- calculate a table.concat stop position

table.insert (fragment_tbl, table.concat (code_tbl, ', ', i, stop)); -- get the fragment and save it

end

table.insert (pretty_suppressed, -- and make all pretty

table.concat ({'[\"', script, '\"] = {', table.concat (fragment_tbl, ',\n\t\t\t\t'), '}'})

);

end

-- pairs() visits the scripts in no particular order; sorting the finished entries makes the output repeatable

table.sort (pretty_suppressed);

-- make final output pretty: six sections, each with a header line, the file date as a comment, and the table source

-- NOTE(review): the quoted literals in the return expression below (and in the variant entry above) are split

-- across physical lines; Lua short strings cannot contain a raw line break, so this text looks damaged in

-- transit -- confirm the original escapes or markup against the upstream module before editing them

return '

------------------------------< I A N A   L A N G U A G E S >--------------------------------------------------
--' ..

file_date .. "
local active = {
" .. table.concat (lang_table, ',
') .. "
}

" ..

"local deprecated = {
" .. table.concat (lang_dep_table, ',
') .. "
}

" ..

"return {
active = active,
deprecated = deprecated,
}

" ..

'------------------------------< I A N A S C R I P T S >------------------------------------------------------
--' ..

file_date .. "
return {
" .. table.concat (script_table, ',
') .. "
}

" ..

'------------------------------< I A N A R E G I O N S >------------------------------------------------------
--' ..

file_date .. "
return {
" .. table.concat (region_table, ',
') .. "
}

" ..

'------------------------------< I A N A V A R I A N T S >----------------------------------------------------
--' ..

file_date .. "
return {
" .. table.concat (variant_table, ',
') .. "
}

" ..

'------------------------------< I A N A S U P P R E S S E D S C R I P T S >--------------------------------
--' ..

file_date .. "
return {
" .. table.concat (pretty_suppressed, ',
') .. "
}

" ..

'------------------------------< I S O 6 3 9 - 1 >------------------------------------------------------------
--' ..

file_date .. "
return {
" .. table.concat (iso_639_1_table, ',
') .. "
}

" .. "

";

end

--[[--------------------------< E X P O R T E D   F U N C T I O N S >------------------------------------------

The public interface of this module: a table whose only member is iana_extract.

]]

local exports = {};
exports.iana_extract = iana_extract;

return exports;