-- Module:Sandbox/Trappist the monk/Emoji data make

--[[

this module reads html of https://unicode.org/Public/emoji/latest/emoji-test.txt and creates a data table suitable

for Module:Emoji

1. open https://unicode.org/Public/emoji/latest/emoji-test.txt

2. view page source

3. select and copy the whole html text to clipboard

4. paste into this module's doc page inside the comment markup

5. save

6. copy the rendered table from the module documentation and paste it over the existing table in Module:Emoji/data

{{#invoke:Sandbox/Trappist the monk/Emoji data make|main}}

Is ~/annotations/americas.html the best source? What about:

https://www.unicode.org/emoji/charts/full-emoji-list.html (takes a very long time to load) – currently v15.1

but: the html source (view source) loads relatively quickly

but: that source is much much much 'longer than the maximum of 2,048 kilobytes'

https://unicode.org/Public/emoji/15.1/emoji-sequences.txt; simple text is good but doesn't provide names for each code

https://unicode.org/Public/emoji/15.1/emoji-test.txt; simple text is good; appears to provide names;

there are duplicates qualified with FE0F as the last subcode; what to do about them? names appear to be

the same so drop the duplicates? date and version can be read from the source

]]

require ('strict');

--[[--------------------------< R E N D E R _ O U T P U T >----------------------------------------------------

wraps the sequence of prettified table entries in <out_t> with the text that opens and closes the lua table
emoji_hex_from_name_t{} and returns the whole thing as one preprocessed string.

<out_t> is modified in place: the closing brace is appended and a header line is inserted at index 1.  The header
line ends with a comment, offset by 13 tabs, that records <version> and <timestamp>.

]]

local function render_output (frame, out_t, timestamp, version)
	out_t[#out_t + 1] = '\t}';									-- closing brace goes after the last entry

	local header = 'local emoji_hex_from_name_t = {'			-- opening stuff
		.. string.rep ('\t', 13)								-- tabs to position the version/timestamp comment
		.. '-- v.' .. version									-- version prefix and the version
		.. '; ' .. timestamp;									-- separator and the timestamp

	table.insert (out_t, 1, header);							-- header becomes the first line of the output

	local text = table.concat (out_t, '\n');					-- one entry per line
	return frame:preprocess (text);
end

--[[--------------------------< N O R M A L I Z E _ N A M E >--------------------------------------------------

convert an emoji name as written in the source text into the form used as an index in the rendered table:
typewriter apostrophes are backslash-escaped, curly quotes are replaced with their typewriter equivalents, runs
of space characters become a single underscore, and the result is down-cased.

]]

local function normalize_name (name)
	name = name:gsub ("'", "\\'");								-- escape ' (U+0027 typewriter apostrophe)
	name = mw.ustring.gsub (name, '[“”‘’]', {
		['“'] = '\"',											-- replace “” (U+201C & U+201D)
		['”'] = '\"',
		['‘'] = "\\'",											-- replace ‘’ (U+2018 & U+2019) with escaped ' (U+0027 typewriter apostrophe)
		['’'] = "\\'",
		});
	name = name:gsub (' +', '_');								-- replace runs of spaces with single underscore
	return name:lower();										-- down case
end


--[[--------------------------< T A B _ C O U N T >------------------------------------------------------------

calculate the number of tabs needed between the end of a rendered table entry and the comment at column 80.
Tabs are taken to be four columns wide.  Always returns at least 1 so that long entries are still separated
from their comment.

]]

local function tab_count (name, hex)
	-- 14 = leading tab (4 columns) + [' (2) + '] = ' (6) + ', (2); ustring.len() because names may hold multibyte characters
	local length = 14 + mw.ustring.len (name) + string.len (hex);
	local count = math.ceil ((80 - length) / 4);				-- the minimum number of tabs to get to column 80
	return math.max (1, count);									-- minimum of 1 (for long entries)
end


--[[--------------------------< M A I N >----------------------------------------------------------------------

entry point.  Reads the emoji-test.txt text that has been pasted into this module's /doc page and returns the
text of a lua table that maps emoji names to emoji hex values, one entry per line, sorted ascending.

{{#invoke:Sandbox/Trappist the monk/Emoji data make|main}}

raises an error with a descriptive message when the doc page cannot be read or when the '# Date:' or '# Version:'
lines cannot be found in it.  Data lines from which no emoji name can be extracted are skipped.

]]

local function main (frame)
	local page_title = frame:getTitle() .. '/doc';
	local title_object_t = mw.title.new (page_title);			-- get the title object for the doc page
	local content = title_object_t and title_object_t:getContent();	-- get the content of that page; nil when it cannot be read
	if not content then
		error ('Emoji data make: unable to read content of ' .. page_title, 0);
	end

	local timestamp = content:match ('# Date: (%d%d%d%d%-%d%d%-%d%d, %d%d:%d%d:%d%d) GMT');	-- get parts of the timestamp
	if not timestamp then
		error ('Emoji data make: no "# Date:" line found in ' .. page_title, 0);
	end
	timestamp = timestamp:gsub (',%s+', 'T');					-- join date and time with 'T'

	local version = content:match ('# Version: (%d+%.%d+)');
	if not version then
		error ('Emoji data make: no "# Version:" line found in ' .. page_title, 0);
	end

	local data_t = {};											-- raw data extracted from source goes here indexed by emoji hex value(s)

	for line in content:gmatch ('([%x ]+;[^\n\r]+)[\n\r]+') do
		local name = line:match ('E%d+%.%d+ (.+)');				-- get emoji name; nil when this is not a well-formed data line
		if name then
			local hex = line:match ('[%x ]+');					-- one or more hexadecimal strings separated by space characters
			hex = mw.text.trim (hex);							-- remove extraneous whitespace
			hex = hex:gsub (' +FE0F$', '');						-- remove trailing u+FE0F
			hex = hex:lower();									-- down case

			local emoji = line:match ('# +([^ ]+) ');			-- get the emoji for use in the entry's comment

			-- entries that share a hex value once the trailing FE0F is stripped overwrite each other; last one wins
			-- NOTE(review): distinct hex sequences that share a name yield duplicate indexes in the rendered table – confirm this is acceptable
			data_t[hex] = {normalize_name (name), emoji};		-- add to the base data list
		end
	end

	local out_t = {};											-- prettified list goes here

	for hex, info_t in pairs (data_t) do						-- spin through data_t and make a prettified list
		table.insert (out_t, table.concat ({
			'\t[\'',											-- indent one tab space; open index
			info_t[1],											-- add emoji name as index
			'\'] = \'',											-- close index; add assignment operator; open value
			hex,												-- add emoji hex value
			'\',',												-- close value
			string.rep ('\t', tab_count (info_t[1], hex)),		-- add enough tabs to get to column 80
			'-- ',												-- start a comment
			info_t[2] or '',									-- and add the emoji when there is one
			}));
	end

	table.sort (out_t);											-- ascending sort

	return render_output (frame, out_t, timestamp, version);	-- make a big string and done
end

--[[--------------------------< E X P O R T S >----------------------------------------------------------------

the module's export table; main() is the only function made available to callers

]]

return {main = main}