-- Module:Sandbox/DePiep/uchar

-- todo split consist Char and Args

-- todo cwith double dotcircle 230/239, 233, 234

-- thought: option "special notes", listing: "whitespace, control, combining, NaC, .."

require( 'strict' )

local p = {}

local getArgs = require( 'Module:Arguments' ).getArgs

local uChar_data = mw.loadData( 'Module:Sandbox/DePiep/uchar/data' )

local uData = require('Module:Unicode data')

local uData_helper = require('Module:Sandbox/DePiep/uchar-helper')

local uBaseConvert = require('Module:BaseConvert')

local yesno = require('Module:Yesno')

local str = require('Module:String')

local plaintext = require('Module:Plain text')

--- local tabletools = require('Module:TableTools')

-- Last error message written by the converters; p.main appends it to its
-- error output.
local ERRstatus = ''

-- Property table of the character being processed; assigned by p.main.
local tUchar = {}

local DOTTED_CIRCLE = '◌' -- U+25CC DOTTED CIRCLE

-- Invisible characters are spelled as UTF-8 byte escapes: a raw literal cannot
-- be told apart from an ordinary space (or from nothing) when reading or
-- editing the source.
local NBSP = '\194\160' -- U+00A0 NO-BREAK SPACE

local LEFT_TO_RIGHT_MARK = '\226\128\142' -- U+200E LEFT-TO-RIGHT MARK

-- Image width used when |image= is given without |size=.
local DEFAULT_IMAGE_SIZE = '21px'

-- Background colour used to make whitespace characters visible.
local WS_BLUE = 'lightblue'

-- Test helper: puts the wikitext `s` in a 'big' element with the WS_BLUE
-- background, nested inside a span with id 'testH'.
-- Returns the serialized HTML of the outer span.
local function testH( s )
	local root = mw.html.create( 'span' )
	root:attr( 'id', 'testH' )

	-- tag() returns the new child node, so the styling applies to the child
	local big = root:tag( 'big' )
	big:css( 'background', WS_BLUE )
	big:wikitext( s )

	return tostring( root )
end

-- Wraps the character string of `tChar` (field uChar) in a span at 150% font
-- size; when tChar.uIsWhitespace is exactly true the span also gets the
-- WS_BLUE background. Returns the serialized HTML.
local function addStyles( tChar )
	local span = mw.html.create( 'span' )

	span:attr( 'id', 'testH' )
	span:css( 'font-size', '150%' )
	if tChar.uIsWhitespace == true then
		span:css( 'background', WS_BLUE )
	end
	span:wikitext( tChar.uChar )

	return tostring( span )
end

-- Entry point for testing testH(): formats the first positional argument.
function p.testH( frame )
	local args = getArgs( frame )
	local text = args[1]
	return testH( text )
end

-- Test entry point: the mw.html builder example from the Scribunto manual.
-- Builds a full-width div with id 'testdiv' holding some text and an hr child,
-- and returns it serialized.
function p.testFromDoc(frame)
	local div = mw.html.create( 'div' )
	div
		:attr( 'id', 'testdiv' )
		:css( 'width', '100%' )
		:wikitext( 'Some text' )
		:tag( 'hr' )
	return tostring( div )
	-- Output: <div id="testdiv" style="width:100%;">Some text<hr /></div>
end

-- FORMATTERS ===== ===== ===== ===== ===== ===== ===== =====

-- Stub for wrapping `s` in a div or span with one attribute.
-- Currently only validates `divspan`: for 'div' or 'span' it returns `s`
-- unchanged plus an empty report string; for anything else it returns nil.
-- `arg` and `val` are not used yet.
local function inTag( s, arg, val, divspan )
	local isSupported = ( divspan == 'div' ) or ( divspan == 'span' )
	if not isSupported then
		return nil -- ERR
	end

	local report = ''
	return s, report
end

-- nil-safe wrapper around mw.text.decode: nil stays nil, any other value is
-- passed to mw.text.decode and its result returned.
local function decodeString( s )
	if s ~= nil then
		return mw.text.decode( s )
	end
	return nil
end

-- Format string in <code> tag / from m:str find word
-- Replaces each run of whitespace by a single &nbsp; so that untrimmed
-- whitespace stays visible. The entity is written out instead of a raw
-- no-break space, which is indistinguishable from a plain space in the source.
-- Returns '' (no tag) for nil input.
local function inCode( s )
	if s == nil then return '' end

	-- the parentheses drop gsub's second return value (the replacement count)
	s = ( string.gsub( s, '%s+', '&nbsp;' ) )

	return '<code>' .. s .. '</code>'
end

-- Use mono font-family ( from: Template:Mono )
-- Wraps `s` in a span with a monospace font-family. Each run of whitespace is
-- replaced by a single &nbsp; so that untrimmed whitespace stays visible.
-- nil is treated as the empty string (an empty span is returned).
local function inMono( s )
	if s == nil then s = '' end

	-- the parentheses drop gsub's second return value (the replacement count)
	s = ( string.gsub( s, '%s+', '&nbsp;' ) )

	return '<span class="monospaced" style="font-family: monospace, monospace;">' .. s .. '</span>'
end

-- Wraps `s` in a span rendered in small caps.
-- Uses the same rule as Smallcaps/styles.css
-- ( span.smallcaps {font-variant: small-caps;} ), applied inline.
-- Returns '' for nil or empty input, so the result can always be concatenated.
local function inSmallcaps( s )
	if ( s == nil ) or ( s == '' ) then return '' end

	return '<span class="smallcaps" style="font-variant: small-caps;">' .. s .. '</span>'
end

-- Builds an external link (wikitext) to fileformat.info.
-- With `uHexBare0x` given: link to that character's data page, labelled with
-- `uHexFormat`, e.g.
--   https://www.fileformat.info/info/unicode/char/00ad/index.htm
-- (hex in lowercase, no "0x").
-- Without it: link to the list page of General Category `sGenCat`, e.g.
--   https://www.fileformat.info/info/unicode/category/Nd/list.htm
local function xlLinkFileFormat( uHexBare0x, uHexFormat, sGenCat )
	if uHexBare0x == nil then
		-- GenCat list
		local head = '[https://www.fileformat.info/info/unicode/category/' .. sGenCat
		return head .. '/list.htm ff.info ' .. sGenCat .. ']'
	end

	-- Character data page
	local hexPath = string.lower( uHexBare0x )
	local target = '[https://www.fileformat.info/info/unicode/char/' .. hexPath .. '/index.htm ff.info '
	return target .. uHexFormat .. ']'
end

-- UHEX HANDLERS & FORMATTERS ----- ----- ----- ----- ----- ----- ----- ----- -----

-- Formats a hex value into the normal form "U+00A9":
-- an optional leading "0x"/"0X" and all leading zeros are removed, the digits
-- are uppercased, then left-padded with zeros to at least four digits.
-- uHex0x: hex string, e.g. '0x00A9' or 'a9'
-- uLink:  when not nil, a todo marker is appended (linking is not implemented)
local function formatUhex( uHex0x, uLink )
	-- parentheses keep only gsub's first return value
	local digits = ( string.gsub( uHex0x, '^0[xX]', '' ) )
	digits = ( string.gsub( digits, '^0*', '' ) )
	digits = string.upper( digits )

	-- string.rep yields '' for a non-positive count, so longer values are not cut
	local uHexFmt = 'U+' .. string.rep( '0', 4 - #digits ) .. digits

	if uLink ~= nil then
		return uHexFmt .. '_[todo: fmt Uhex_link_U+]'
	end
	return uHexFmt
end

-- Formats a General Category code as "<code in mono>=<first data item>",
-- reading the item from uChar_data.tGenCat.
-- Returns '' when the code is not in the data table. `fmt` is not used.
local function formatGenCat( sGenCat, fmt )
	local entry = uChar_data.tGenCat[sGenCat]
	if entry ~= nil then
		return inMono( sGenCat ) .. '=' .. entry[1]
	end
	return ''
end

-- Formats table ( array ) using concat
-- Items are joined with '; ', each whitespace run is replaced by one no-break
-- space ( keeps untrimmed space visible ), and the result is set in the
-- monospace font-family between '<' and '>'.
-- Returns '' for nil input.
local function formatTablelist( t ) -- unused?
	if t == nil then return '' end

	local joined = table.concat( t, '; ' )

	-- Keep only gsub's first return value. Previously the replacement count
	-- was passed on to mw.text.decode as its second argument; being a number
	-- it is always truthy, so named-entity decoding was switched on by
	-- accident. The flag is now passed explicitly.
	local spaced = ( string.gsub( joined, '%s+', '&nbsp;' ) )
	spaced = mw.text.decode( spaced, true )

	return '<' .. inMono( spaced ) .. '>'
end

-- Determines the prefix to show before a character, from the combining
-- property and the |cwith= argument.
-- is_combining: boolean property of the character
-- cWith: raw |cwith= value (may be nil)
-- Returns: the prefix string, and a debug report string.
--   cwith absent, or a yesno-true value: DOTTED_CIRCLE when is_combining is
--     true, otherwise no prefix
--   cwith a yesno-false value: no prefix
--   any other cwith: the cwith text itself (wikicode stripped)
-- todo need 4-way logic for cwith
local function formatCombiningChar( is_combining, cWith )
	local withChar = decodeString( cWith )
	local report = 'is_combi: ' .. tostring( is_combining ) .. '; cwith: ' .. tostring( withChar )

	-- strip wikicode; but save NBSP -- todo improve, test
	if withChar ~= nil then
		local shielded = string.gsub( withChar, NBSP, 'NBSP' )
		local stripped = plaintext._main( shielded, false )
		local restored = string.gsub( stripped, 'NBSP', NBSP )
		withChar = restored
	end

	local choice = yesno( withChar ) -- y/n/nil (3-way logic; 'foo' == nil)

	if ( withChar == nil ) or ( choice == true ) then
		-- default: per is_combining
		report = report .. '_dflt non-combi = none'
		if is_combining == true then
			return DOTTED_CIRCLE, report .. '_dflt'
		end
		return '', report
	end

	if choice == false then
		-- explicitly false, so suppress
		return '', report .. '_false, suppress'
	end

	-- use character provided by cwith
	return withChar, report .. '_cleanchar: ' .. tostring( withChar )
end

-- READ & PROCESS ==== ====== ===== ===== ===== ===== ===== =====

-- Normalises a user-supplied hex code point.
-- Accepted forms: 'A9', '00a9', 'U+00A9', 'u+a9', '0x00A9', '0X00A9', with
-- whitespace anywhere; HTML entities in the input are decoded first.
-- Returns four values:
--   uHex0x      '0x' .. digits, e.g. '0x00A9'
--   uHexNum     the numeric value
--   uHexBare0x  the uppercased digits without prefix, e.g. '00A9'
--   uHexFormat  normal form via formatUhex, e.g. 'U+00A9'
-- On error, sets ERRstatus and returns nil.
local function convertHexInToHex0x( uHexAnyform )
	if ( uHexAnyform == nil ) or ( uHexAnyform == '' ) then
		ERRstatus ='ERR convertHexInToHex0x: no uHex input'
		return nil
	end

	-- tostring: a numeric input must not reach the string functions as a number
	local uHexBare0x = decodeString( tostring( uHexAnyform ) )
	-- parentheses keep only gsub's first return value
	uHexBare0x = ( string.gsub( uHexBare0x, '%s', '' ) )
	-- uppercase before stripping, so that 'u+' and '0X' are recognised too
	uHexBare0x = string.upper( uHexBare0x )
	uHexBare0x = ( string.gsub( uHexBare0x, '^U%+', '' ) )
	uHexBare0x = ( string.gsub( uHexBare0x, '^0X', '' ) )

	-- Only plain hex digits are a valid code point; this also rejects the
	-- empty string, signs and anything tonumber() might read as a fraction.
	if string.match( uHexBare0x, '^%x+$' ) == nil then
		-- report the offending input (the parsed number is always nil here)
		ERRstatus ='ERR convertHexInToHex0x: uHex is not hex: >' .. tostring( uHexAnyform ) .. '<'
		return nil
	end

	local uHex0x = '0x' .. uHexBare0x
	local uHexNum = tonumber( uHexBare0x, 16 )
	if ( uHexNum == nil ) or ( uHexNum > 0x10FFFF ) then
		ERRstatus ='ERR convertHexInToHex0x: uHex out of U+ range' .. uHex0x
		return nil
	end

	local uHexFormat = formatUhex( uHex0x )

	return uHex0x, uHexNum, uHexBare0x, uHexFormat
end

-- Converts a hex value to decimal through Module:BaseConvert
-- (from base 16 to base 10). nil input gives nil.
-- Returns a single value: the first result of the conversion.
local function convertHexToDec( uHex0x )
	if uHex0x ~= nil then
		local request = { n = uHex0x, base = 10, from = 16 }
		local decimal = uBaseConvert.convert( request )
		return decimal
	end
	return nil
end

-- Converts a decimal value to hex through Module:BaseConvert
-- (from base 10 to base 16). nil input gives nil.
-- todo: dec input is NaN, err, edge
local function convertDecToHex( uDec )
	if uDec ~= nil then
		local request = { n = tonumber( uDec, 10 ), base = 16, from = 10 }
		return uBaseConvert.convert( request )
	end
	return nil
end

-- GET DATA ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

-- Returns the block of a code point, as looked up in Module:Unicode data.
-- (Previously the lookup result was discarded and the placeholder 'blck'
-- returned.)
-- uHexNum: numeric code point
local function getBlock( uHexNum )
	local block = uData.lookup_block( uHexNum )
	return block
end

-- Returns "<plane number>: <plane label>" for a numeric code point.
-- The plane number is the code point divided by 0x10000, rounded down;
-- the label is read from uChar_data.tPlanes.
local function getPlane( uHexNum )
	local planeNo = math.floor( uHexNum / 0x10000 )
	local planeLabel = uChar_data.tPlanes[planeNo]
	return planeNo .. ': ' .. planeLabel
end

-- Returns the canonical combining class (CCC) from the -helper module's
-- lookup_combiningclass, or '' when that lookup gives nil/false.
-- todo: 239 (230), 233, 234 = between spacing chars.
local function getCombiningClass( uHex0x )
	local found = uData_helper.lookup_combiningclass( uHex0x ) -- new -helper function
	if found then
		return found
	end
	return ''
end

-- Returns the named character references of a code point as one display
-- string, or nil when there are none.
-- uDec: decimal code point (number or numeric string), the key into
--       Module:Numcr2namecr
-- fmt:  not used
-- demo data: [168]='&uml;, &die;, &Dot;, &DoubleDot;'
-- Each name is shown literally ( '&amp;' keeps the wikitext parser from
-- rendering the entity as its character ), in the monospace font,
-- double spaced.
local function getNamedEntities( uDec, fmt )
	local tNamedEntitiesData = mw.loadData( 'Module:Numcr2namecr' )

	local sNameList = tNamedEntitiesData[tonumber( uDec )]
	if sNameList == nil then return nil end

	sNameList = decodeString( sNameList ) -- has literal '&' in source

	-- a hit is the text between a '&' and the next ';'
	local patstring = '%f[^&][^%;]+%f[%;]'
	local tNames = {}

	-- one pass over the list, instead of rescanning from the start per hit
	for hit in mw.ustring.gmatch( sNameList, patstring ) do
		local hitWord = mw.text.trim( hit )
		if hitWord == '' then
			-- an empty name ends the list, as before
			break
		end
		tNames[#tNames + 1] = inMono( '&amp;' .. hitWord .. ';' )
	end

	if #tNames == 0 then return nil end

	-- double spaced; the no-break space keeps the second space from collapsing
	return table.concat( tNames, '&nbsp; ' )
end

-- Returns the aliases of a code point, grouped by alias type, or nil when the
-- code point has no entry in Module:Unicode data/aliases.
-- The result always has the five keys abbreviation, alternate, control,
-- correction and figment, each an array (possibly empty) of alias names.
-- demo 0x002118 = weier
local function getAliases( uHex )
	local perCodepoint = mw.loadData( 'Module:Unicode data/aliases' )[uHex]
	if perCodepoint == nil then return nil end

	local byKind = {}
	for _, kind in ipairs( { 'abbreviation', 'alternate', 'control', 'correction', 'figment' } ) do
		byKind[kind] = {}
	end

	-- each entry is a pair: entry[1] = alias type, entry[2] = alias name
	for _, entry in ipairs( perCodepoint ) do
		if type( entry ) == 'table' then
			table.insert( byKind[entry[1]], entry[2] )
		end
	end

	return byKind
end

-- Returns the name for a script code, as listed under `aliases` in
-- Module:Unicode data/scripts; '_unk' when the code is not listed;
-- nil for nil input.
local function getScriptName( sScriptISO )
	local scriptData = mw.loadData( 'Module:Unicode data/scripts' )

	if sScriptISO == nil then
		return nil
	end

	return scriptData.aliases[sScriptISO] or '_unk'
end

-- Formats the grouped aliases ( see getAliases ) as one report line:
-- '<br/>ALIASES: ' followed by ' <type>: <name>; <name>' for each alias type
-- that has at least one name. Types are listed in alphabetical order, so the
-- output does not depend on the traversal order of pairs().
-- Returns nil for nil input. `fmt` is not used.
local function formatAlias5( t5Alias, fmt )
	if t5Alias == nil then return nil end

	local kinds = {}
	for kind in pairs( t5Alias ) do
		kinds[#kinds + 1] = kind
	end
	table.sort( kinds )

	local parts = { '<br/>ALIASES: ' }
	for _, kind in ipairs( kinds ) do
		local names = t5Alias[kind]
		if #names > 0 then
			parts[#parts + 1] = ' ' .. kind .. ': ' .. table.concat( names, '; ' )
		end
	end

	return table.concat( parts )
end

-- 1. PARSE INCOMING ARGS

-- 2. READ PROPERTIES

-- Picks the code point input from the original arguments.
-- Three alternative identifiers are accepted: hex ( args[1] or |hex= ),
-- |dec= and |char=. When more than one is given, hex wins over char, and
-- char wins over dec. An empty string counts as "not supplied", so a blank
-- first parameter does not mask the other inputs.
-- Returns: the chosen input as hex string ( nil when nothing was supplied ),
-- and a debug report string.
-- todo: what if >1 input?: err msg, conflictcheck
local function readCodepointInput( origArgs )
	local function supplied( v )
		if v == '' then return nil end
		return v
	end

	local hexIn = supplied( origArgs[1] ) or supplied( origArgs['hex'] )
	local decIn = supplied( origArgs['dec'] )
	local charIn = supplied( decodeString( origArgs['char'] ) )

	-- Explicit count: the three inputs are optional, and the length
	-- operator is not defined for a list with holes.
	local count = 0
	local values = ''
	for _, v in ipairs( { hexIn or false, decIn or false, charIn or false } ) do
		if v then
			count = count + 1
			values = values .. ' ' .. tostring( v ) .. ';;'
		end
	end

	local rprt = 'R-t0:0 R-t2:' .. count .. values
	if decIn ~= nil then rprt = rprt .. ' dec;' end
	if charIn ~= nil then rprt = rprt .. ' char;' end
	if hexIn ~= nil then rprt = rprt .. ' hex;' end

	-- only the winning input is converted
	local uHexIn
	if hexIn ~= nil then
		uHexIn = hexIn
	elseif charIn ~= nil then
		uHexIn = convertDecToHex( mw.ustring.codepoint( charIn ) )
	elseif decIn ~= nil then
		uHexIn = convertDecToHex( decIn )
	end

	return uHexIn, ' |ID in: #t4=' .. count .. ':>' .. rprt .. tostring( uHexIn ) .. '<| '
end

-- 1. Parses the incoming arguments, 2. reads the character properties.
-- origArgs: argument table ( untrimmed, blanks kept; see p.main )
-- Returns a table of normalised arguments and properties. When no valid hex
-- value could be derived, the table holds only 'rprtOrigIDs' ( 'uHex0x' is
-- nil ) and ERRstatus has been set by convertHexInToHex0x.
local function getArgsAndProps( origArgs )
	local tNewArgs = {}
	local uHexIn

	-- PART 1 READ & NORMALISE ORIG ARGUMENTS
	-- HEX DEC CHAR
	uHexIn, tNewArgs['rprtOrigIDs'] = readCodepointInput( origArgs )

	-- returns: uHex0x, uHexNum, uHexBare0x, uHexFormat
	tNewArgs['uHex0x'], tNewArgs['uHexNum'], tNewArgs['uHexBare0x'], tNewArgs['uHexFormat'] = convertHexInToHex0x( uHexIn )

	if tNewArgs['uHex0x'] == nil then -- ERROR
		-- shortcut to error #1: no uHex (valid 0x) input
		return tNewArgs
	end

	-- local shortcut only
	local uHex0x = tNewArgs['uHex0x']
	local uHexNum = tNewArgs['uHexNum']

	-- DEC
	tNewArgs['uDec'] = convertHexToDec( uHex0x )

	-- OTHER ORIG ARGS
	tNewArgs['uNameLink'] = origArgs['link'] or origArgs['nlink'] -- old nlink = depr paramname
	tNewArgs['format'] = origArgs['format'] or ''
	tNewArgs['cwith'] = decodeString( origArgs['cwith'] )
	tNewArgs['uSize'] = origArgs['size']
	tNewArgs['uImage'] = origArgs['image']
	tNewArgs['html'] = origArgs['html'] -- depr?
	tNewArgs['ulink'] = origArgs['ulink'] -- old ulink = depr?
	-- test notice
	tNewArgs['test'] = origArgs['test'] or ''

	-- PART 2 READ & USE PROPERTIES
	-- ASSIGNED, GenCat, Control, Char
	tNewArgs['uIsAssigned'] = uData.is_assigned( uHexNum )

	if tNewArgs['uIsAssigned'] == true then
		tNewArgs['uGenCat'] = uData.lookup_category( uHexNum )
		-- Build the reference from the bare digits: uHex0x carries the '0x'
		-- prefix, which would give the malformed '&#x0x00A9;'.
		tNewArgs['uChar'] = mw.text.decode( '&#x' .. tNewArgs['uHexBare0x'] .. ';' )
	else
		tNewArgs['uGenCat'] = 'Xx' -- todo not assigned == ?
		tNewArgs['uChar'] = 'ERR_not_assg' -- ERROR
	end

	tNewArgs['uBlock'] = uData.lookup_block( uHexNum )
	tNewArgs['uPlane'] = getPlane( uHexNum )

	-- CONTROL; assuming GenCat Cc and "is control" are 1:1
	tNewArgs['uIsControl'] = ( tNewArgs['uGenCat'] == 'Cc' )
	if tNewArgs['uIsControl'] then
		-- CHAR replacement: show a placeholder, not the control character
		tNewArgs['uChar'] = '\239\191\189' -- U+FFFD REPLACEMENT CHARACTER
	end

	-- NAME, ALIASES
	tNewArgs['uName'] = uData.lookup_name( uHexNum )
	tNewArgs['Aliases'] = getAliases( uHexNum ) -- table5

	-- PROPS Script, Latin, WS
	tNewArgs['uIsLatin'] = uData.is_Latin( tostring( tNewArgs['uChar'] ) )
	tNewArgs['uScript'] = uData.lookup_script( uHexNum )
	tNewArgs['uScriptName'] = getScriptName( tNewArgs['uScript'] )
	tNewArgs['uIsWhitespace'] = uData.is_whitespace( uHexNum )

	-- PROPS rtl
	tNewArgs['uIsRtl'] = uData.is_rtl( tostring( tNewArgs['uChar'] ) )

	-- PROPS2 COMBINING PREFIX Combining/cwith/dottedcircle, CCC
	tNewArgs['uIsCombining'] = uData.is_combining( uHexNum ) or false
	-- The class is read for every character ( the earlier combining-only
	-- lookup was always overwritten by an unconditional one ).
	tNewArgs['uCombiningClass'] = getCombiningClass( uHexNum )
	tNewArgs['uCharPrefix'], tNewArgs['uCwithReport'] = formatCombiningChar( tNewArgs['uIsCombining'], tNewArgs['cwith'] )

	-- CHAR SUFFIX; rtl
	if tNewArgs['uIsRtl'] == true then
		tNewArgs['uCharSuffix'] = LEFT_TO_RIGHT_MARK
	else
		tNewArgs['uCharSuffix'] = ''
	end

	-- PROPS3: NamedEntities; reuses the decimal value computed above
	tNewArgs['NamedEntities'] = getNamedEntities( tNewArgs['uDec'] )

	return tNewArgs
end

-- Entry point for calls from other modules. Not implemented yet: returns a
-- fixed placeholder string and ignores `args`.
function p._main ( args )
	local placeholder = '_todo _main'
	return placeholder
end

-- Template entry point. Reads the arguments ( untrimmed, blanks kept, so that
-- whitespace input survives ), looks up the character properties and returns
-- a multi-line report in wikitext: the formatted U+ value, the character ( or
-- the image given by |image= ), its name, and the test/debug lines.
-- When no valid code point could be derived, returns an error line instead.
function p.main ( frame )
	local origArgs = getArgs( frame, { trim=false, removeBlanks=false } )

	-- kept in the module-level table
	tUchar = getArgsAndProps( origArgs )

	if tUchar['uHex0x'] == nil then
		return ' >' .. ( origArgs[1] or '?' ) .. '< ERR hexIn ' .. ERRstatus .. ' ' .. (tUchar['rprtOrigIDs'] or 'unk1')
	end

	-- string together: cwith prefix, character, rtl suffix
	tUchar.uChar = tUchar['uCharPrefix'] .. tUchar.uChar .. tUchar['uCharSuffix']

	-- the glyph: an image when one is named, else the css-formatted character
	local glyph
	if ( tUchar['uImage'] ~= nil ) and ( tUchar['uImage'] ~= '' ) then
		glyph = '[[File:' .. tUchar['uImage'] .. '|' .. ( tUchar['uSize'] or DEFAULT_IMAGE_SIZE ) .. ']]'
	else
		glyph = addStyles( tUchar )
	end

	-- REPORT RPRT
	local s = formatUhex( tUchar['uHex0x'] ) .. ' ' .. glyph .. ' '

	-- 'or': a missing name must not break the concatenation
	s = s .. ( inSmallcaps( tUchar['uName'] ) or '' )

	s = s .. '<br/>[testing: ' .. tUchar['test'] .. ']' .. (tUchar['rprtOrigIDs'] or '?') .. '→ '
		.. tUchar['uHex0x'] .. ' [' .. tUchar['uDec'] .. 'dec]'.. '; (' .. xlLinkFileFormat( tUchar['uHexBare0x'], tUchar['uHexFormat'] ) .. ') '
		.. 'GC: ' .. formatGenCat( tUchar['uGenCat'] ) .. ' (' .. xlLinkFileFormat( nil, nil, tUchar['uGenCat'] ) .. ')'
		.. '<br/>ASSIG: ' .. tostring( tUchar['uIsAssigned'] ) .. '; '
		.. 'WS: '.. tostring( tUchar['uIsWhitespace'] )
		.. '<br/>BLK: ' .. tostring( tUchar['uBlock'] ) .. '; PLANE: ' .. tostring( tUchar['uPlane'] ) .. '; '
		.. '<br/>SC: ' .. tostring( tUchar['uScript'] ) .. '=' .. tostring( tUchar['uScriptName'] ) .. '; RTLsuffix:' .. tostring( tUchar['uIsRtl'] ) .. '; '

	s = s .. '<br/>COMBI PREFIX: >' .. tUchar['uCharPrefix'] .. '<; ' .. tUchar['uCwithReport']
		.. '; CCC class:' .. ( tUchar['uCombiningClass'] or '-' )

	if tUchar['NamedEntities'] ~= nil then
		s = s .. '<br/>NAMED ENTITIES: ' .. tUchar['NamedEntities']
	end

	if tUchar['Aliases'] ~= nil then
		s = s .. ( formatAlias5( tUchar['Aliases'], 'report' ) or '' )
	end

	return s
end

-- Test entry point: returns the code points of the first two characters of
-- the |char= argument, as given by mw.ustring.codepoint.
function p.test(frame)
	local firstPos, lastPos = 1, 2
	return mw.ustring.codepoint( frame.args['char'], firstPos, lastPos )
end

-- Test entry point for getScriptName(): looks up the script code given as the
-- first positional argument.
function p.testScriptName( frame )
	return getScriptName( frame.args[1] )
end

return p