-- Module:Sandbox/PHansen/URLutil

-- From :de:Modul:URLutil

-- Via :en:User:PHansen/URLutil

-- Descriptions

-- en: :de:Wikipedia:Lua/Modul/URLutil/en

-- de: :de:Wikipedia:Lua/Modul/URLutil/de

-- Test : :de:Wikipedia:Lua/Modul/URLutil/Test

-- Wikidata: :d:Q10859193

--[=[ URLutil 2014-09-20

Utilities for URL etc. on www.

  • getAuthority()
  • getFragment()
  • getHost()
  • getLocation()
  • getPath()
  • getPort()
  • getQuery()
  • getQueryTable()
  • getRelativePath()
  • getScheme()
  • getTLD()
  • getTop2domain()
  • getTop3domain()

  • getURIScheme()
  • isAuthority()
  • isDomain()
  • isDomainExample()
  • isDomainInt()
  • isHost()
  • isIP()
  • isIPlocal()
  • isIPv4()
  • isIPv6()
  • isMailAddress()
  • isMailLink()
  • isProtocolDialog()
  • isProtocolMW()

  • isProtocolWiki()
  • isResourceURL()
  • isSuspiciousURL()
  • isUnescapedURL()
  • isWebURL()
  • wikiEscapeURL()

Only dotted decimal notation for IPv4 supported.

Does not support dotted hexadecimal, dotted octal, or single-number formats.

IPv6 URL (bracketed) not yet implemented; might need wiki-syntax escaping anyway.

]=]

-- table for export

local URLutil = {}

-- Extract the scheme of a URI.
--     uri  -- string to inspect; any other type yields false
-- Returns the lower-cased scheme (without colon) when at least two letters
-- are followed by ":"; returns "//" when no scheme letters precede two
-- slashes; returns false in every other case.
function URLutil.getURIScheme( uri )
    if type( uri ) ~= "string" then
        return false
    end
    -- All three parts are optional, so the match never fails on a string.
    local letters, sep, lead = string.match( uri, "^%s*([a-zA-Z]*)(:?)(/?/?)" )
    if sep == ":" and letters:len() >= 2 then
        return string.lower( letters )
    end
    if lead == "//" and letters == "" then
        return "//"
    end
    return false
end -- getURIScheme()

-- Shared worker for getTop2domain() / getTop3domain().
--     url   -- URL whose host is examined via URLutil.getHost()
--     mode  -- 3 selects the last three labels; anything else the last two
-- Returns the matching tail of the host, or false if there is no host or
-- the host does not end in the required number of labels.
local function getTopDomain( url, mode )
    local host = URLutil.getHost( url )
    if not host then
        return false
    end
    -- Tail of the capture group; the opening "(" is added below.
    local labels = "[%w%%]+%.%a[%w-]*%a)$"
    if mode == 3 then
        labels = "[%w%%]+%." .. labels
    end
    -- A leading dot is prepended so the first label is dot-delimited, too.
    return mw.ustring.match( "." .. host, "%.(" .. labels ) or false
end -- getTopDomain()

-- Extract the authority ("host" or "host:port") of a URL.
--     url  -- string; any other type yields false
-- Returns the lower-cased host, with ":port" appended when a port starting
-- with 1-9 is given. Returns nil when the string carries no acceptable
-- authority (host rejected by URLutil.isHost(), or a colon without a
-- usable port).
function URLutil.getAuthority( url )
    if type( url ) ~= "string" then
        return false
    end
    local text = mw.text.decode( url )
    -- Drop a fragment; the search starts at position 6.
    local hash = text:find( "#", 6, true )
    if hash then
        text = text:sub( 1, hash - 1 )
    end
    -- A trailing slash is appended so the pattern can always end on "/".
    local host, colon, port =
        mw.ustring.match( text .. "/",
                          "^%s*%w*:?//([%w%.%%-]+)(:?)([%d]*)/" )
    if not URLutil.isHost( host ) then
        return nil
    end
    host = mw.ustring.lower( host )
    if colon ~= ":" then
        if #port == 0 then
            return host
        end
        return nil
    end
    if port:find( "^[1-9]" ) then
        return host .. ":" .. port
    end
    return nil
end -- URLutil.getAuthority()

-- Extract the fragment (text after the first "#") of a URL.
--     url     -- string; any other type yields nil
--     decode  -- optional string: "%" percent-decodes the fragment,
--                "WIKI" first turns ".XX" into "%XX" and "_" into " ",
--                then percent-decodes; other values leave it untouched
-- Returns the fragment without the leading "#", or false if the URL has
-- no "#".
function URLutil.getFragment( url, decode )
    if type( url ) ~= "string" then
        return nil
    end
    local text = mw.text.decode( url )
    local hash = text:find( "#", 1, true )
    if not hash then
        return false
    end
    local fragment = mw.text.trim( text:sub( hash ) ):sub( 2 )
    if type( decode ) == "string" then
        local encoding = mw.text.trim( decode )
        if encoding == "WIKI" then
            -- Single-target assignment keeps only the string result of gsub.
            fragment = fragment:gsub( "%.(%x%x)", "%%%1" )
            fragment = fragment:gsub( "_", " " )
        end
        if encoding == "WIKI" or encoding == "%" then
            fragment = mw.uri.decode( fragment, "PATH" )
        end
    end
    return fragment
end -- URLutil.getFragment()

-- Extract the host of a URL, i.e. the authority without any ":port".
--     url  -- passed on to URLutil.getAuthority()
-- Returns the host string; if there is no authority, the falsy result of
-- URLutil.getAuthority() is returned unchanged.
function URLutil.getHost( url )
    local authority = URLutil.getAuthority( url )
    if authority then
        -- Parentheses keep the result to a single value.
        return ( mw.ustring.match( authority, "^([%w%.%%-]+):?[%d]*$" ) )
    end
    return authority
end -- URLutil.getHost()

-- Return a URL without its fragment.
--     url  -- string; any other type yields nil
-- Returns the trimmed, entity-decoded text before the first "#";
-- false if the string is blank or starts with "#".
function URLutil.getLocation( url )
    if type( url ) ~= "string" then
        return nil
    end
    local trimmed = mw.text.trim( url )
    if trimmed == "" then
        return false
    end
    local text = mw.text.decode( trimmed )
    local hash = text:find( "#", 1, true )
    if hash == 1 then
        return false
    end
    if hash then
        return text:sub( 1, hash - 1 )
    end
    return text
end -- URLutil.getLocation()

-- Return the path of a URL, without query and fragment.
--     url  -- passed on to URLutil.getRelativePath()
-- Returns the relative path cut before the first "?" and the first "#";
-- a falsy result of URLutil.getRelativePath() is returned unchanged.
function URLutil.getPath( url )
    local path = URLutil.getRelativePath( url )
    if not path then
        return path
    end
    -- An empty capture is still truthy in Lua, so "or" only kicks in
    -- when the delimiter is absent.
    path = path:match( "^([^%?]*)%?" ) or path
    path = path:match( "^([^#]*)#" ) or path
    return path
end -- URLutil.getPath()

-- Return the port number given in a URL.
--     url  -- passed on to URLutil.getAuthority()
-- Returns the port as a number; false if the authority has no port;
-- the falsy result of URLutil.getAuthority() if there is no authority.
function URLutil.getPort( url )
    local authority = URLutil.getAuthority( url )
    if not authority then
        return authority
    end
    local digits = authority:match( ":([1-9][0-9]*)$" )
    if digits then
        return tonumber( digits )
    end
    return false
end -- URLutil.getPort()

-- Return the query of a URL, or the value of one query parameter.
--     url        -- passed on to URLutil.getLocation()
--     key        -- optional string; name of the parameter to look up.
--                   Surrounding whitespace is ignored and the name is
--                   matched literally.
--     separator  -- optional string; one of & ; , / (default "&")
-- Returns the whole query (text after the first "?") when no key is given,
-- the value of the parameter when a key is given, false if there is no
-- query or no such parameter, and the falsy result of
-- URLutil.getLocation() if there is no location.
function URLutil.getQuery( url, key, separator )
    local r = URLutil.getLocation( url )
    if r then
        r = r:match( "^[^%?]*%?(.+)$" )
        if r and type( key ) == "string" then
            local sep = "&"
            if type( separator ) == "string" then
                local s = mw.text.trim( separator )
                if s:match( "^[&;,/]$" ) then
                    sep = s
                end
            end
            -- Use the trimmed key and escape every punctuation character,
            -- so names such as "a-b", "x[]" or "a%20b" are compared
            -- literally instead of being read as pattern syntax.
            local single = mw.text.trim( key ):gsub( "%p", "%%%0" )
            local scan = sep .. single .. "=([^" .. sep .. "]*)" .. sep
            -- Wrapping the query in separators lets the first and last
            -- parameter match the same way as inner ones.
            r = ( sep .. r .. sep ):match( scan )
        end
        if not r then
            r = false
        end
    end
    return r
end -- URLutil.getQuery()

-- Return the query parameters of a URL as a table.
--     url        -- passed on to URLutil.getQuery()
--     separator  -- optional string; one of & ; , / (default "&")
-- Returns a table mapping each parameter name to its value string;
-- a component with no "=" after its first character is stored with the
-- value false. A falsy result of URLutil.getQuery() is returned unchanged.
function URLutil.getQueryTable( url, separator )
    local query = URLutil.getQuery( url )
    if not query then
        return query
    end
    local delimiter = "&"
    if type( separator ) == "string" then
        local candidate = mw.text.trim( separator )
        if candidate:match( "^[&;,/]$" ) then
            delimiter = candidate
        end
    end
    local result = { }
    for _, item in ipairs( mw.text.split( query, delimiter, true ) ) do
        -- Search from position 2: a name needs at least one character.
        if item:find( "=", 2, true ) then
            local name, value = item:match( "^([^=]+)=(.*)$" )
            if name then
                result[ name ] = value
            end
        else
            result[ item ] = false
        end
    end
    return result
end -- URLutil.getQueryTable()

-- Return everything of a URL from the path onwards.
--     url  -- string; any other type yields nil
-- Returns the trimmed text starting at the first "/" after the host;
-- "/" if there is no such text but URLutil.isResourceURL() accepts the
-- URL; false otherwise.
function URLutil.getRelativePath( url )
    if type( url ) ~= "string" then
        return nil
    end
    local path
    local rest = url:match( "^%s*[a-zA-Z]*://(.*)$" )
    if rest then
        -- "scheme://host/..." : skip the host part.
        path = rest:match( "[^/]+(/.*)$" )
    else
        local extra, tail = url:match( "^%s*(/?)(/.*)$" )
        if extra == "/" then
            -- "//host/..." : skip the host part.
            path = tail:match( "/[^/]+(/.*)$" )
        else
            -- "/..." is already a path; tail is nil if nothing matched.
            path = tail
        end
    end
    if path then
        return ( mw.text.trim( path ) )
    end
    if URLutil.isResourceURL( url ) then
        return "/"
    end
    return false
end -- URLutil.getRelativePath()

-- Return the scheme of a "//"-style URL, including the slashes.
--     url  -- string; any other type yields nil
-- Returns "scheme://" in lower case when more than two letters and a colon
-- precede "//"; returns "//" when nothing precedes the two slashes;
-- false otherwise.
function URLutil.getScheme( url )
    if type( url ) ~= "string" then
        return nil
    end
    local letters, colon, slashes = url:match( "^%s*([a-zA-Z]*)(:?)(//)" )
    if slashes ~= "//" then
        -- No match at all.
        return false
    end
    if colon == ":" then
        if #letters > 2 then
            return letters:lower() .. "://"
        end
        return false
    end
    if #letters == 0 then
        return "//"
    end
    return false
end -- URLutil.getScheme()

-- Return the top-level domain of the host of a URL.
--     url  -- passed on to URLutil.getHost()
-- Returns the last label of the host (starting and ending with a letter);
-- false if the host does not end that way; the falsy result of
-- URLutil.getHost() if there is no host.
function URLutil.getTLD( url )
    local host = URLutil.getHost( url )
    if not host then
        return host
    end
    return mw.ustring.match( host, "[%w]+%.(%a[%w-]*%a)$" ) or false
end -- URLutil.getTLD()

-- Return the last two labels of the host of a URL, or false.
--     url  -- passed on to getTopDomain() with mode 2
function URLutil.getTop2domain( url )
    local domain = getTopDomain( url, 2 )
    return domain
end -- URLutil.getTop2domain()

-- Return the last three labels of the host of a URL, or false.
--     url  -- passed on to getTopDomain() with mode 3
function URLutil.getTop3domain( url )
    local domain = getTopDomain( url, 3 )
    return domain
end -- URLutil.getTop3domain()

-- Check whether a string is an authority: "host" or "host:port".
--     s  -- string; any other type yields nil
-- Returns the result of URLutil.isHost() for the host part when the port
-- part is acceptable; false when the string does not have the expected
-- shape, when a colon is not followed by a port made of digits that starts
-- with 1-9, or when digits appear without a colon.
function URLutil.isAuthority( s )
    if type( s ) ~= "string" then
        return nil
    end
    local host, colon, port =
        mw.ustring.match( s, "^%s*([%w%.%%-]+)(:?)(%d*)%s*$" )
    if not host then
        return false
    end
    -- A failed port check must end the test here; it must not be
    -- overridden by the host check below.
    if colon == ":" then
        if not port:match( "^[1-9][0-9]*$" ) then
            return false
        end
    elseif port ~= "" then
        return false
    end
    return URLutil.isHost( host )
end -- URLutil.isAuthority()

-- Check whether a string looks like a domain name.
--     s  -- string; any other type yields nil
-- Returns true if the string consists of dot-separated labels ending in a
-- top-level label that starts and ends with a letter; false if the part
-- before the top-level label contains two adjacent dots; nil if the string
-- does not have that shape or does not start with a word character.
function URLutil.isDomain( s )
    if type( s ) ~= "string" then
        return nil
    end
    -- Only the first capture (everything before the last label) is needed.
    local labels = mw.ustring.match( s,
                       "^%s*([%w%.%%-]+%w)%.(%a[%w-]*%a)%s*$" )
    if type( labels ) ~= "string" then
        return nil
    end
    if not mw.ustring.find( labels, "^%w" ) then
        return nil
    end
    -- Plain (non-pattern) search for two consecutive dots.
    return not mw.ustring.find( labels, "..", 1, true )
end -- URLutil.isDomain()

-- Check whether the host of a URL is a reserved example domain.
-- RFC 2606: example.com example.net example.org example.edu
--     url  -- passed on to getTopDomain() with mode 2
-- Returns true if the last two labels are "example" plus one of the four
-- listed TLDs (case-insensitive); false otherwise.
function URLutil.isDomainExample( url )
    local domain = getTopDomain( url, 2 )
    if not domain then
        return domain
    end
    local tld = domain:lower():match( "^example%.([a-z][a-z][a-z])$" )
    if not tld then
        return false
    end
    local reserved = { com = true, edu = true, net = true, org = true }
    return reserved[ tld ] == true
end -- URLutil.isDomainExample()

-- Check for an Internationalized Domain Name (Punycode).
--     url  -- passed on to URLutil.getHost()
-- Returns true if the host contains a character outside "!" .. "~" or if
-- one of its labels starts with "xn--"; false for any other host; the
-- falsy result of URLutil.getHost() if there is no host.
function URLutil.isDomainInt( url )
    local host = URLutil.getHost( url )
    if not host then
        return host
    end
    if not host:match( "^[!-~]+$" ) then
        return true
    end
    -- A leading dot is prepended so the first label is tested like the rest.
    return ( "." .. host ):find( ".xn--", 1, true ) ~= nil
end -- URLutil.isDomainInt()

-- Check whether a string is a host: a domain name or an IP address.
--     s  -- passed on to URLutil.isDomain() and URLutil.isIP()
-- Returns the truthy result of URLutil.isDomain() if there is one,
-- otherwise whatever URLutil.isIP() returns.
function URLutil.isHost( s )
    local verdict = URLutil.isDomain( s )
    if verdict then
        return verdict
    end
    return URLutil.isIP( s )
end -- URLutil.isHost()

-- Tell which kind of IP address a string is.
--     s  -- passed on to URLutil.isIPv4() and URLutil.isIPv6()
-- Returns 4 for an IPv4 address, 6 for an IPv6 address, otherwise the
-- falsy result of URLutil.isIPv6().
function URLutil.isIP( s )
    if URLutil.isIPv4( s ) then
        return 4
    end
    local v6 = URLutil.isIPv6( s )
    if v6 then
        return 6
    end
    return v6
end -- URLutil.isIP()

-- Check whether a string is a local or private IPv4 address.
-- IPv4 according to RFC 1918, RFC 1122; even any 0.0.0.0 (RFC 5735)
--     s  -- string; any other type yields false
-- Returns true for 0.*.*.*, 10.*.*.*, 127.*.*.*, 172.(16...31).*.* and
-- 192.168.*.* (apart from the 0.* case, the string must also pass
-- URLutil.isIPv4()); false otherwise.
function URLutil.isIPlocal( s )
    if type( s ) ~= "string" then
        return false
    end
    local r = false
    -- Every range of interest has a first octet beginning with 0 or 1.
    local num = s:match( "^ *([01][0-9]*)%." )
    if num then
        num = tonumber( num )
        if num == 0 then
            r = s:match( "^ *0+%.[0-9]+%.[0-9]+%.[0-9]+ *$" )
        elseif num == 10 or num == 127 then
            -- loopback; private/local host: 127.0.0.1
            r = URLutil.isIPv4( s )
        elseif num == 169 then
            -- 169.254.*.*
            -- NOTE(review): this branch was empty in the original, so the
            -- range is not reported as local; confirm whether intended.
        elseif num == 172 then
            -- 172.(16...31).*.*
            local second = s:match( "^ *0*172%.([0-9]+)%." )
            if second then
                second = tonumber( second )
                if second >= 16 and second <= 31 then
                    r = URLutil.isIPv4( s )
                end
            end
        elseif num == 192 then
            -- 192.168.*.*
            -- This used to test an undefined variable, so the branch
            -- could never be entered.
            local second = s:match( "^ *0*192%.([0-9]+)%." )
            if second and tonumber( second ) == 168 then
                r = URLutil.isIPv4( s )
            end
        end
    end
    -- Normalise a matched string (0.* case) to a plain boolean.
    return r and true or false
end -- URLutil.isIPlocal()

-- Check whether a string is an IPv4 address in dotted decimal notation.
--     s  -- string; any other type yields false
-- Returns true if the string (optionally surrounded by whitespace) consists
-- of four dot-separated numbers below 256, the first not starting with 0;
-- false otherwise.
function URLutil.isIPv4( s )
    if type( s ) ~= "string" then
        return false
    end
    -- Collect the four captures; the table stays empty if nothing matched.
    local octets = { s:match( "^%s*([1-9][0-9]?[0-9]?)%.([12]?[0-9]?[0-9])%.([12]?[0-9]?[0-9])%.([12]?[0-9]?[0-9])%s*$" ) }
    if #octets ~= 4 then
        return false
    end
    for i = 1, 4 do
        if tonumber( octets[ i ] ) >= 256 then
            return false
        end
    end
    return true
end -- URLutil.isIPv4()

-- Check whether a string is an IPv6 address (colon-separated hex groups).
--     s  -- string; any other type yields false
-- Returns true if the string, after trimming surrounding whitespace, holds
-- eight groups of 1-4 hex digits, or fewer groups with exactly one "::";
-- false otherwise.
function URLutil.isIPv6( s )
    if type( s ) ~= "string" then
        return false
    end
    -- Trim before validating; otherwise surrounding whitespace is rejected
    -- by the character check and trimming never has any effect.
    s = mw.text.trim( s )
    if s:len() == 0
       or s:find( "[^:%x]" )    -- only colon and hex digits are legal chars
       or s:find( "^:[^:]" )    -- can begin or end with :: but not with single :
       or s:find( "[^:]:$" )
       or s:find( ":::" )
    then
        return false
    end
    local dcolon, groups
    s, dcolon = s:gsub( "::", ":" )
    if dcolon > 1 then
        return false
    end    -- at most one ::
    s = s:gsub( "^:?", ":" )    -- prepend : if needed
    s, groups = s:gsub( ":%x%x?%x?%x?", "" )    -- remove valid groups, and count them
    return ( ( dcolon == 1 and groups < 8 ) or
             ( dcolon == 0 and groups == 8 ) )
           and ( s:len() == 0 or ( dcolon == 1 and s == ":" ) )    -- might be one dangling : if original ended with ::
end -- URLutil.isIPv6()

-- Check whether a string looks like an e-mail address.
--     s  -- string; any other type yields false
-- Returns the result of URLutil.isDomain() for the part after "@";
-- that call receives nil when the string is not of the form local@domain.
function URLutil.isMailAddress( s )
    if type( s ) ~= "string" then
        return false
    end
    local domain = mw.ustring.match( s, "^%s*[%w%.%%_-]+@([%w%.%%-]+)%s*$" )
    return URLutil.isDomain( domain )
end -- URLutil.isMailAddress()

-- Check whether a string is a "mailto:" link to an e-mail address.
--     s  -- string; any other type yields false
-- Returns the result of URLutil.isMailAddress() for the address after
-- "mailto:" (any letter case); false if the string has another shape.
function URLutil.isMailLink( s )
    if type( s ) ~= "string" then
        return false
    end
    local scheme, address = mw.ustring.match( s, "^%s*([Mm][Aa][Ii][Ll][Tt][Oo]):(%S[%w%.%%_-]*@[%w%.%%-]+)%s*$" )
    if type( scheme ) == "string" and scheme:lower() == "mailto" then
        return URLutil.isMailAddress( address )
    end
    return false
end -- URLutil.isMailLink()

-- Shared worker for the isProtocol...() checks.
--     prot      -- protocol as "scheme", "scheme:", "scheme://" or "//";
--                  any non-string yields false
--     supplied  -- space-separated list of accepted scheme names, with a
--                  leading and a trailing space
-- Returns true if prot is a bare "//" or if its scheme (lower-cased) is
-- listed in supplied; false otherwise.
local function isProtocolAccepted( prot, supplied )
    if type( prot ) ~= "string" then
        return false
    end
    local scheme, colon, slashes =
        mw.ustring.match( prot, "^%s*([a-zA-Z]*)(:?)(/?/?)%s*$" )
    if slashes == "/" then
        -- A single slash is never acceptable.
        return false
    end
    if scheme == "" then
        -- Without a scheme only the bare "//" form is accepted.
        return colon ~= ":" and slashes == "//"
    end
    if colon == ":" or slashes == "" then
        local hit = supplied:match( " " .. scheme:lower() .. " " )
        return type( hit ) == "string"
    end
    return false
end -- isProtocolAccepted()

-- Check a protocol against the long scheme list below.
--     prot  -- passed on to isProtocolAccepted()
-- NOTE(review): the name suggests these are the schemes MediaWiki accepts
-- for external links -- confirm.
function URLutil.isProtocolMW( prot )
    local accepted = " http https ftp ftps ssh sftp irc ircs xmpp sip sips gopher telnet nntp worldwind mailto tel sms news svn git mms bitcoin magnet urn geo "
    return isProtocolAccepted( prot, accepted )
end -- URLutil.isProtocolMW()

-- Check a protocol against the schemes mailto, irc, ircs, ssh and telnet.
--     prot  -- passed on to isProtocolAccepted()
function URLutil.isProtocolDialog( prot )
    local accepted = " mailto irc ircs ssh telnet "
    return isProtocolAccepted( prot, accepted )
end -- URLutil.isProtocolDialog()

-- Check a protocol against the schemes ftp, ftps, git, http, https, nntp,
-- sftp, svn and worldwind.
--     prot  -- passed on to isProtocolAccepted()
function URLutil.isProtocolWiki( prot )
    local accepted = " ftp ftps git http https nntp sftp svn worldwind "
    return isProtocolAccepted( prot, accepted )
end -- URLutil.isProtocolWiki()

-- Check whether a URL points to a resource via //, http, https, ftp or
-- sftp.
--     url  -- passed on to URLutil.getScheme() and URLutil.getAuthority()
-- Returns true if the scheme is one of the five listed, an authority is
-- present and the URL has no whitespace between non-space characters;
-- false otherwise.
function URLutil.isResourceURL( url )
    local scheme = URLutil.getScheme( url )
    if not scheme then
        return false
    end
    local known = " // http:// https:// ftp:// sftp:// "
    if not known:find( string.format( " %s ", scheme ) ) then
        return false
    end
    if not URLutil.getAuthority( url ) then
        return false
    end
    return not url:match( "%S%s+%S" )
end -- URLutil.isResourceURL()

-- Check whether a URL looks suspicious.
--     url  -- passed on to URLutil.isResourceURL()
-- Returns true if the URL is not accepted by URLutil.isResourceURL(), or
-- if it contains "''", "[", "|", "]", a character in U+2009..U+200F or
-- U+202A..U+202F, or U+2060, or if it ends with "." or ",";
-- false otherwise.
function URLutil.isSuspiciousURL( url )
    if not URLutil.isResourceURL( url ) then
        return true
    end
    local authority = URLutil.getAuthority( url )
    -- Character class with two code point ranges (45 is "-").
    local pat = "[%[|%]" ..
                mw.ustring.char( 8201, 45, 8207, 8234, 45, 8239, 8288 )
                .. "]"
    -- The class holds multi-byte characters, so it must be applied with
    -- the Unicode-aware find; the byte-oriented string.find would read the
    -- ranges as byte ranges and flag most non-ASCII URLs.
    if authority:find( "@" )
       or url:find( "''" )
       or mw.ustring.find( url, pat )
       or url:find( "[%.,]$" ) then
        return true
    end
    -- TODO zero width character ??
    return false
end -- URLutil.isSuspiciousURL()

-- Check whether a web URL contains "[", "|" or "]".
--     url       -- passed on to URLutil.isWebURL()
--     trailing  -- if this is a string, the result is always false
--                  NOTE(review): the purpose of this parameter is not
--                  visible here; behaviour kept as found.
-- Returns true if trailing is not a string, the URL is accepted by
-- URLutil.isWebURL() and it contains one of the three characters;
-- false otherwise.
function URLutil.isUnescapedURL( url, trailing )
    if type( trailing ) == "string" then
        return false
    end
    if not URLutil.isWebURL( url ) then
        return false
    end
    return url:match( "[%[|%]]" ) ~= nil
end -- URLutil.isUnescapedURL()

-- Check whether a string is a URL with a scheme and an authority.
--     url  -- passed on to URLutil.getScheme() and URLutil.getAuthority()
-- Returns true if both are present and the URL has no whitespace between
-- non-space characters; false otherwise.
function URLutil.isWebURL( url )
    if not URLutil.getScheme( url ) then
        return false
    end
    if not URLutil.getAuthority( url ) then
        return false
    end
    return not url:match( "%S%s+%S" )
end -- URLutil.isWebURL()

-- Escape the characters "[", "|" and "]" in a URL as numeric HTML
-- entities, so they are not read as wiki markup.
--     url  -- string; any other value is returned unchanged
-- Returns the URL with "[" -> "&#91;", "|" -> "&#124;", "]" -> "&#93;".
function URLutil.wikiEscapeURL( url )
    if type( url ) ~= "string" then
        return url
    end
    if url:find( "[%[|%]]" ) then
        -- Single-target assignments keep only the string result of gsub.
        url = url:gsub( "%[", "&#91;" )
        url = url:gsub( "|", "&#124;" )
        url = url:gsub( "%]", "&#93;" )
    end
    return url
end -- URLutil.wikiEscapeURL()

-- Provide template access and expose URLutil table to require

local p = {}

-- Template access: URLutil.getURIScheme() of frame.args[ 1 ], or "".
p.getURIScheme = function ( frame )
    local scheme = URLutil.getURIScheme( frame.args[ 1 ] )
    return scheme or ""
end

-- Template access: URLutil.getAuthority() of frame.args[ 1 ], or "".
p.getAuthority = function ( frame )
    local authority = URLutil.getAuthority( frame.args[ 1 ] )
    return authority or ""
end

-- Template access: fragment of frame.args[ 1 ] with a leading "#", or "".
-- frame.args[ 2 ] is passed on as the decode mode of URLutil.getFragment().
p.getFragment = function ( frame )
    local fragment = URLutil.getFragment( frame.args[ 1 ], frame.args[ 2 ] )
    if not fragment then
        return ""
    end
    return "#" .. fragment
end

-- Template access: URLutil.getHost() of frame.args[ 1 ], or "".
p.getHost = function ( frame )
    local host = URLutil.getHost( frame.args[ 1 ] )
    return host or ""
end

-- Template access: URLutil.getLocation() of frame.args[ 1 ], or "".
p.getLocation = function ( frame )
    local location = URLutil.getLocation( frame.args[ 1 ] )
    return location or ""
end

-- Template access: URLutil.getPath() of frame.args[ 1 ], or "".
p.getPath = function ( frame )
    local path = URLutil.getPath( frame.args[ 1 ] )
    return path or ""
end

-- Template access: URLutil.getPort() of frame.args[ 1 ] (a number), or "".
p.getPort = function ( frame )
    local port = URLutil.getPort( frame.args[ 1 ] )
    return port or ""
end

-- Template access to URLutil.getQuery().
--     frame.args[ 1 ]  -- URL
--     frame.args[ 2 ]  -- optional parameter name; blank counts as absent
--     frame.args[ 3 ]  -- optional separator
-- Returns the parameter value when a name is given, the whole query with
-- a leading "?" when not, or "" if nothing was found.
p.getQuery = function ( frame )
    local key = frame.args[ 2 ]
    if key then
        key = mw.text.trim( key )
        if key == "" then
            key = nil
        end
    end
    local found = URLutil.getQuery( frame.args[ 1 ], key, frame.args[ 3 ] )
    if not found then
        return ""
    end
    if key then
        return found
    end
    return "?" .. found
end

-- Template access: URLutil.getRelativePath() of frame.args[ 1 ], or "".
p.getRelativePath = function ( frame )
    local path = URLutil.getRelativePath( frame.args[ 1 ] )
    return path or ""
end

-- Template access: URLutil.getScheme() of frame.args[ 1 ], or "".
p.getScheme = function ( frame )
    local scheme = URLutil.getScheme( frame.args[ 1 ] )
    return scheme or ""
end

-- Template access: URLutil.getTLD() of frame.args[ 1 ], or "".
p.getTLD = function ( frame )
    local tld = URLutil.getTLD( frame.args[ 1 ] )
    return tld or ""
end

-- Template access: URLutil.getTop2domain() of frame.args[ 1 ], or "".
p.getTop2domain = function ( frame )
    local domain = URLutil.getTop2domain( frame.args[ 1 ] )
    return domain or ""
end

-- Template access: URLutil.getTop3domain() of frame.args[ 1 ], or "".
p.getTop3domain = function ( frame )
    local domain = URLutil.getTop3domain( frame.args[ 1 ] )
    return domain or ""
end

-- Template access: "1" if URLutil.isAuthority() accepts frame.args[ 1 ],
-- otherwise "".
p.isAuthority = function ( frame )
    if URLutil.isAuthority( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isDomain() accepts frame.args[ 1 ],
-- otherwise "".
p.isDomain = function ( frame )
    if URLutil.isDomain( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isDomainExample() accepts
-- frame.args[ 1 ], otherwise "".
p.isDomainExample = function ( frame )
    if URLutil.isDomainExample( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isDomainInt() accepts frame.args[ 1 ],
-- otherwise "".
p.isDomainInt = function ( frame )
    if URLutil.isDomainInt( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isHost() accepts frame.args[ 1 ],
-- otherwise "".
p.isHost = function ( frame )
    if URLutil.isHost( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: 4 or 6 as returned by URLutil.isIP() for
-- frame.args[ 1 ], or "" if it is no IP address.
p.isIP = function ( frame )
    local version = URLutil.isIP( frame.args[ 1 ] )
    return version or ""
end

-- Template access: "1" if URLutil.isIPlocal() accepts frame.args[ 1 ],
-- otherwise "".
p.isIPlocal = function ( frame )
    if URLutil.isIPlocal( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isIPv4() accepts frame.args[ 1 ],
-- otherwise "".
p.isIPv4 = function ( frame )
    if URLutil.isIPv4( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isIPv6() accepts frame.args[ 1 ],
-- otherwise "".
p.isIPv6 = function ( frame )
    if URLutil.isIPv6( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isMailAddress() accepts
-- frame.args[ 1 ], otherwise "".
p.isMailAddress = function ( frame )
    if URLutil.isMailAddress( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isMailLink() accepts frame.args[ 1 ],
-- otherwise "".
p.isMailLink = function ( frame )
    if URLutil.isMailLink( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isProtocolMW() accepts frame.args[ 1 ],
-- otherwise "".
p.isProtocolMW = function ( frame )
    if URLutil.isProtocolMW( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isProtocolDialog() accepts
-- frame.args[ 1 ], otherwise "".
p.isProtocolDialog = function ( frame )
    if URLutil.isProtocolDialog( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isProtocolWiki() accepts
-- frame.args[ 1 ], otherwise "".
p.isProtocolWiki = function ( frame )
    if URLutil.isProtocolWiki( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isResourceURL() accepts
-- frame.args[ 1 ], otherwise "".
p.isResourceURL = function ( frame )
    if URLutil.isResourceURL( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isSuspiciousURL() flags
-- frame.args[ 1 ], otherwise "".
p.isSuspiciousURL = function ( frame )
    if URLutil.isSuspiciousURL( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isUnescapedURL() returns true for
-- frame.args[ 1 ] (URL) and frame.args[ 2 ] (trailing), otherwise "".
p.isUnescapedURL = function ( frame )
    if URLutil.isUnescapedURL( frame.args[ 1 ], frame.args[ 2 ] ) then
        return "1"
    end
    return ""
end

-- Template access: "1" if URLutil.isWebURL() accepts frame.args[ 1 ],
-- otherwise "".
p.isWebURL = function ( frame )
    if URLutil.isWebURL( frame.args[ 1 ] ) then
        return "1"
    end
    return ""
end

-- Template access: URLutil.wikiEscapeURL() of frame.args[ 1 ].
-- Returns "" when the argument is missing, instead of handing a
-- non-string to the escaping function.
function p.wikiEscapeURL( frame )
    local url = frame.args[ 1 ]
    if type( url ) ~= "string" then
        return ""
    end
    return URLutil.wikiEscapeURL( url )
end

-- Access for other modules: returns the URLutil function table itself.
p.URLutil = function ()
    return URLutil
end

return p