Module:Sandbox/Trappist the monk/Emoji data make
--[[
this module reads html of https://unicode.org/Public/emoji/latest/emoji-test.txt and creates a data table suitable
for Module:Emoji
1. open https://unicode.org/Public/emoji/latest/emoji-test.txt
2. view page source
3. select and copy the whole html text to clipboard
4. paste into this module's doc page inside the comment markup
5. save
6. copy the rendered table from the module documentation and paste it over the existing table in Module:Emoji/data
{{#invoke:Sandbox/Trappist the monk/Emoji data make|main}}
Is ~/annotations/americas.html the best source? What about:
https://www.unicode.org/emoji/charts/full-emoji-list.html (takes a very long time to load) – currently v15.1
but: the html source (view source) loads relatively quickly
but: that source is much much much 'longer than the maximum of 2,048 kilobytes'
https://unicode.org/Public/emoji/15.1/emoji-sequences.txt; simple text is good but doesn't provide names for each code
https://unicode.org/Public/emoji/15.1/emoji-test.txt; simple text is good; appears to provide names;
there are duplicates qualified with FE0F as the last subcode; what to do about them? names appear to be
the same so drop the duplicates? date and version can be read from the source
]]
require ('strict');
--[[--------------------------< R E N D E R _ O U T P U T >----------------------------------------------------
render the base table emotbl{} that this module creates.

<frame> – frame object; its preprocess() method is applied to the assembled text
<out_t> – sequence of already-formatted table-entry lines; modified in place: a closing '\t}' line is appended
and a header line is inserted at index 1
<timestamp>, <version> – values written into the header line's '-- v.<version>; <timestamp>' comment

returns the result of frame:preprocess() applied to the members of <out_t> joined with newline characters

NOTE(review): the first member of the header's table.concat() list (the line that holds only a single quote
mark) looks truncated in this copy of the source; presumably it carried the opening text of the rendered table –
confirm against the live module before relying on this function
]]
local function render_output (frame, out_t, timestamp, version)
table.insert (out_t, '\t}'); -- to close the table
table.insert (out_t, 1, table.concat ({ -- insert this at the start of the output sequence
'
string.rep ('\t', 13), -- tabs to position the version/timestamp comment
'-- v.', -- version prefix
version, -- the version
'; ', -- separator
timestamp, -- and the timestamp
}));
return frame:preprocess (table.concat (out_t, '\n')); -- make a big string and done
end
--[[--------------------------< M A I N >----------------------------------------------------------------------

entry point; reads the emoji-test.txt text that has been pasted into this module's /doc page and renders a lua
table of emoji-name / hex-code-point pairs

{{#invoke:Sandbox/Trappist the monk/Emoji data make|main}}

<frame> – the frame object supplied by #invoke

returns whatever render_output() returns for the sorted list of table entries

raises a script error with a descriptive message when the /doc page cannot be read or when the '# Date:' or
'# Version:' header lines cannot be found in it.  data lines that do not carry an 'E<major>.<minor> <name>'
annotation are skipped instead of aborting the run

]]

local function main (frame)
	local page_title = frame:getTitle() .. '/doc';
	local title_object_t = mw.title.new (page_title);							-- title object for this module's doc page; nil when the title is not valid
	local content = title_object_t and title_object_t:getContent();			-- nil when the page does not exist
	if not content then
		error ('unable to read the content of ' .. page_title, 0);
	end

	local timestamp = content:match ('# Date: (%d%d%d%d%-%d%d%-%d%d, %d%d:%d%d:%d%d) GMT');	-- get parts of the timestamp
	if not timestamp then
		error ('"# Date:" header line not found in ' .. page_title, 0);
	end
	timestamp = timestamp:gsub (',%s+', 'T');									-- 'YYYY-MM-DD, hh:mm:ss' -> 'YYYY-MM-DDThh:mm:ss'

	local version = content:match ('# Version: (%d+%.%d+)');
	if not version then
		error ('"# Version:" header line not found in ' .. page_title, 0);
	end

	local data_t = {};															-- raw data extracted from source goes here indexed by emoji hex value(s)

	for line in content:gmatch ('([%x ]+;[^\n\r]+)[\n\r]+') do
		local name = line:match ('E%d+%.%d+ (.+)');								-- get emoji name
		if name then															-- lines without a name are not emoji data lines; skip them
			local hex = mw.text.trim (line:match ('[%x ]+'));					-- one or more hexadecimal strings separated by space characters
			hex = hex:gsub (' +FE0F$', ''):lower();								-- remove trailing u+FE0F so qualified and unqualified forms share one key; down case

			local emoji = line:match ('# +([^ ]+) ') or '';						-- the emoji itself for use in the trailing comment; empty string when not found

			name = mw.text.trim (name);											-- so that trailing whitespace does not become a trailing underscore
			name = name:gsub ("'", "\\'");										-- escape ' (U+0027 typewriter apostrophe); name is rendered inside single quotes
			name = mw.ustring.gsub (name, '[“”‘’]', {
				['“'] = '\"',													-- replace “” (U+201C & U+201D)
				['”'] = '\"',
				['‘'] = "\\'",													-- replace ‘’ (U+2018 & U+2019) with escaped ' (U+0027 typewriter apostrophe)
				['’'] = "\\'",
				});
			name = name:gsub (' +', '_'):lower();								-- each run of spaces becomes a single underscore; down case

			data_t[hex] = {name, emoji};										-- add to the base data list; a later line with the same hex replaces an earlier one
		end
	end

	local function tab_count (hex, name)										-- number of tabs needed between end of entry and the column 80 comment
		local length = 14 + mw.ustring.len (name) + string.len (hex);			-- 14 is the fixed part: leading tab (4 columns) + [' + '] = ' + ',; ustring.len() because there are some multibyte characters
		local white_space = 80 - length;										-- comments begin at column 80
		return math.max (1, math.ceil (white_space / 4));						-- tabs are 4 columns wide; round up; minimum of 1 (for long entries)
	end

	local out_t = {};															-- prettified list goes here

	for hex, info_t in pairs (data_t) do										-- spin through data_t and make a prettified list
		local name, emoji = info_t[1], info_t[2];
		table.insert (out_t, table.concat ({
			'\t[\'',															-- indent one tab space; open index
			name,																-- add emoji name as index
			'\'] = \'',															-- close index; add assignment operator; open value
			hex,																-- add emoji hex value
			'\',',																-- close value
			string.rep ('\t', tab_count (hex, name)),							-- add enough tabs to get to column 80
			'-- ',																-- start a comment
			emoji,																-- and add the emoji
			}));
	end

	table.sort (out_t);															-- ascending sort; pairs() order is unspecified so sort to get stable output

	return render_output (frame, out_t, timestamp, version);					-- make a big string and done
end
--[[--------------------------< E X P O R T S >----------------------------------------------------------------
]]
return {
main = main,
}