-- Module:Make emoji zwj table

--[[

This module creates an associative table of emoji code points that may follow a zero-width joiner character (U+200D).

The module reads a copy of the Unicode Emoji ZWJ Sequences for UTS (typically emoji-zwj-sequences.txt found in

https://unicode.org/Public/emoji/VV.V/ where VV.V is the Unicode version number). The copy of the unicode data

file is held inside html comments in the module's /doc page. From that file, the module extracts pairs of

'200D xxxx' (U+200D followed by a hex code point). The module saves each unique code point, transformed as necessary, to build a new version

of emoji_t for use in Module:Citation/CS1/Configuration.

The module takes one positional parameter:

{{#invoke:make emoji zwj table|main|<url>}}

<url> is the url that matches the Unicode data file. Alas, Lua modules cannot read external data files so

<url> is merely used to document where the data may be found.

Use of this module is documented on its /doc page

]]

require('strict');

-- Lookup table of human-readable names for the code points that main() finds after U+200D.
-- Keys are hexadecimal code point strings without the 'U+' prefix, written in upper case; values are
-- lower-case names.  main() uses the names in two places: as the trailing text of each comment in the
-- generated emoji_t listing, and as the values of the regenerated copy of this table that it emits.
-- A code point that has no entry here is emitted with an empty name.
local emoji_names_t = { -- keys are hex values from U+xxxx code points

-- four-hex-digit code points

['2194'] = 'left right arrow',

['2195'] = 'up down arrow',

['2620'] = 'skull and crossbones',

['2640'] = 'female sign',

['2642'] = 'male sign',

['2695'] = 'staff of aesculapius',

['2696'] = 'scales',

['26A7'] = 'male with stroke and male and female sign',

['2708'] = 'airplane',

['2744'] = 'snowflake',

['2764'] = 'heavy black heart',

['27A1'] = 'black rightwards arrow',

['2B1B'] = 'black large square',

-- five-hex-digit code points

['1F308'] = 'rainbow',

['1F32B'] = 'fog',

['1F33E'] = 'ear of rice',

['1F373'] = 'cooking',

['1F37C'] = 'baby bottle',

['1F384'] = 'christmas tree',

['1F393'] = 'graduation cap',

['1F3A4'] = 'microphone',

['1F3A8'] = 'artist palette',

['1F3EB'] = 'school',

['1F3ED'] = 'factory',

['1F466'] = 'boy',

['1F467'] = 'girl',

['1F468'] = 'man',

['1F469'] = 'woman',

['1F48B'] = 'kiss mark',

['1F4A5'] = 'collision symbol',

['1F4A8'] = 'dash symbol',

['1F4AB'] = 'dizzy symbol',

['1F4BB'] = 'personal computer',

['1F4BC'] = 'brief case',

['1F525'] = 'fire',

['1F527'] = 'wrench',

['1F52C'] = 'microscope',

['1F5E8'] = 'left speech bubble',

['1F680'] = 'rocket',

['1F692'] = 'fire engine',

['1F7E9'] = 'large green square',

['1F7EB'] = 'large brown square',

['1F91D'] = 'handshake',

['1F9AF'] = 'probing cane',

['1F9B0'] = 'emoji component red hair',

['1F9B1'] = 'emoji component curly hair',

['1F9B2'] = 'emoji component bald',

['1F9B3'] = 'emoji component white hair',

['1F9BA'] = 'safety vest',

['1F9BC'] = 'motorized wheelchair',

['1F9BD'] = 'manual wheelchair',

['1F9D1'] = 'adult',

['1F9D2'] = 'child',

['1FA79'] = 'adhesive bandage',

['1FAF2'] = 'leftwards hand',

}

--[[--------------------------< M A I N >----------------------------------------------------------------------

]]

local function main (frame)
	-- Entry point for {{#invoke:make emoji zwj table|main|<url>}}.
	--
	-- Reads the wikitext of this module's ~/doc page, collects every unique hex code point that
	-- directly follows '200D ' in that text, and returns preprocessed wikitext holding two listings:
	--	1. emoji_t: decimal code point keys, each assigned true, sorted ascending
	--	2. emoji_names_t: hex code point keys with the names known to this module, sorted ascending
	--
	-- frame.args[1] is only echoed into the '-- from:' comment of the first listing; it is never fetched.
	-- Raises an error when the ~/doc page content cannot be read.

	local this_wiki = table.concat ({':', mw.language.getContentLanguage():getCode(), ':'});	-- content-language code wrapped in colons
	local title_obj = mw.title.getCurrentTitle();

	local doc_title_obj = title_obj;											-- assume that we are viewing the ~/doc page standalone
	if not title_obj.prefixedText:match ('/doc$') then							-- when viewing the module page
		doc_title_obj = mw.title.new (table.concat ({title_obj.prefixedText, '/doc'}));	-- get a title object for the ~/doc page
	end

	local content = doc_title_obj and doc_title_obj:getContent();				-- nil when there is no title object or no page content
	if not content then
		error ('make emoji zwj table: unable to read the Unicode data from the /doc page', 0);	-- say why instead of failing on a nil index
	end

	local code_points_t = {};													-- hex code points already seen; used to drop duplicates
	local out_t = {};															-- final output goes here
	local new_emoji_names_t = {};												-- lines for the regenerated emoji_names_t listing

	local tabs_15 = string.rep ('\t', 15);										-- for six-digit keys
	local tabs_16 = string.rep ('\t', 16);										-- for keys that have fewer than six digits

	-- NOTE(review): the markup in the next two strings was missing from the previous copy of this module
	-- (stripped by tooling, which left unterminated string literals); the tag name was restored from the
	-- surrounding 'close the ... tag' comments -- confirm against the live module
	local hilite_open = '<syntaxhighlight lang="lua">';
	local hilite_close = '</syntaxhighlight>';

	-- fall back to placeholder text so that a nil never reaches table.concat() (which would raise an error)
	local file_date = content:match ('# *Date: *(%d%d%d%d%-%d%d%-%d%d)') or 'unknown';	-- file date of the Unicode source
	local file_version = content:match ('# *Version: *([%d%.]+)') or 'unknown';	-- version of the Unicode source
	local source_url = frame.args[1] or '';										-- documentation only

	for code_point in content:gmatch ('200D (%x+)') do							-- find each U+200D / following-code-point pair
		if not code_points_t[code_point] then									-- if we have not seen this before
			code_points_t[code_point] = true;									-- remember that we have now seen this

			local code_point_dec = tonumber (code_point, 16);					-- decimal form for the output table key
			local name = emoji_names_t[code_point] or '';						-- empty string when we do not have a name

			table.insert (out_t, table.concat ({								-- build an emoji_t line for this code point
				'\t[',															-- open key markup
				code_point_dec,													-- in decimal
				'] = true,',													-- close key and assign it the value 'true'
				(100000 <= code_point_dec) and tabs_15 or tabs_16,				-- tabs between the k/v pair and its comment
				'-- U+',														-- start the comment; prefix for the hex
				code_point,
				' &#x',															-- hex html entity prefix
				code_point,
				'; ',															-- finish the html entity
				name,
				}));

			table.insert (new_emoji_names_t, table.concat ({					-- build an emoji_names_t line for this code point
				'\t[\'',														-- open key markup
				code_point,														-- in hex
				'\'] = \'',														-- close key, open quote mark
				name,
				'\',',															-- closing quote mark and terminal comma
				}));
		end
	end

	local function compare_dec (a, b)											-- ascending order on the decimal key in '[ddd]'
		a = a:match ('%[(%d+)%]');												-- extract decimal key text
		b = b:match ('%[(%d+)%]');
		return tonumber (a) < tonumber (b);										-- compare as numbers, not as text
	end

	local function compare_hex (a, b)											-- ascending order on the hex key in "['xxx']"
		a = a:match ('%[\'(%x+)\'%]');											-- extract hexadecimal key text
		b = b:match ('%[\'(%x+)\'%]');
		return tonumber (a, 16) < tonumber (b, 16);								-- compare as numbers, not as text
	end

	table.sort (out_t, compare_dec);											-- must happen before the prefix (which has no key) is inserted

	-- NOTE(review): the wikilink brackets around this_wiki..nsText..baseText (here and in the emoji_names_t
	-- instructions below) were also missing from the previous copy; restored because this_wiki carries the
	-- leading colon of a wikilink target -- confirm against the live module
	local prefix_t = {															-- build a prefix for this version of the table
		'==emoji_t==',
		'use this table to overwrite same-named table in Module:Citation/CS1/Configuration/sandbox',
		table.concat ({hilite_open, '-- list of emoji that use a zwj character (U+200D) to combine with another emoji'}),
		table.concat ({'-- from: ', source_url, '; version: ', file_version, '; ', file_date}),
		table.concat ({'-- table created by: [[', this_wiki, title_obj.nsText, ':', title_obj.baseText, ']]'}),
		table.concat ({'local emoji_t = {', tabs_16, '-- indexes are decimal forms of the hex values in U+xxxx'}),
		};

	table.insert (out_t, 1, table.concat (prefix_t, '\n'));						-- insert at the head of the output table
	table.insert (out_t, table.concat ({'\t}', hilite_close}));					-- close the table and the syntaxhighlight tag

	table.sort (new_emoji_names_t, compare_hex);								-- ascending numerical sort on hexadecimal keys

	table.insert (out_t, '==emoji_names_t==');
	table.insert (out_t, table.concat ({'use this table to overwrite same-named table in [[', this_wiki, title_obj.nsText, ':', title_obj.baseText, ']]; add missing names.'}));
	table.insert (out_t, table.concat ({'\n', hilite_open, 'local emoji_names_t = {', tabs_15, '-- keys are hex values from U+xxxx code points'}));

	for _, v in ipairs (new_emoji_names_t) do
		table.insert (out_t, v);
	end

	table.insert (out_t, table.concat ({'\t}', hilite_close}));					-- close the table and the syntaxhighlight tag

	return frame:preprocess (table.concat (out_t, '\n'));						-- make a big string, let the parser expand the tags, and done
end

--[[--------------------------< E X P O R T S >----------------------------------------------------------------

]]

local exports_t = {};															-- functions made available to {{#invoke:}}

exports_t.main = main;															-- {{#invoke:make emoji zwj table|main|...}}

return exports_t;