You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
456 lines
14 KiB
Lua
456 lines
14 KiB
Lua
--- @module Class providing the actual XML parser.
--  Available options are:
--      * stripWS
--        Strip non-significant whitespace (leading/trailing)
--        and do not generate events for empty text elements
--
--      * expandEntities
--        Expand entities (standard entities + single char
--        numeric entities only currently - could be extended
--        at runtime if suitable DTD parser added elements
--        to table (see obj._ENTITIES). May also be possible
--        to expand multibyte entities for UTF-8 only
--
--      * errorHandler
--        Custom error handler function
--
--  NOTE: Boolean options must be set to 'nil' not '0'

--- Translates a decimal character reference into text.
-- Codes from 0 to 255 become the single character produced by string.char;
-- any other code is handed back unchanged in the form "&#code;".
--@param code the decimal value to convert to its respective character
--@return the one-character string, or the "&#code;" reference
local function decimalToHtmlChar(code)
    local value = tonumber(code)
    local fitsInOneByte = value >= 0 and value < 256
    if not fitsInOneByte then
        return "&#" .. code .. ";"
    end
    return string.char(value)
end

--- Translates a hexadecimal character reference into text.
-- Codes from 0x00 to 0xFF become the single character produced by
-- string.char; any other code is handed back unchanged in the form "&#xcode;".
--@param code the hexadecimal value to convert to its respective character
--@return the one-character string, or the "&#xcode;" reference
local function hexadecimalToHtmlChar(code)
    local value = tonumber(code, 16)
    local fitsInOneByte = value >= 0 and value < 256
    if not fitsInOneByte then
        return "&#x" .. code .. ";"
    end
    return string.char(value)
end

--- Prototype table for parser objects: holds the Lua patterns, the error
-- messages and the entity table used by the parsing functions.
local XmlParser = {
    -- Private attributes/functions

    -- Leading text, '<', optional '/', tag contents, optional '/', '>'
    _XML = '^([^<]*)<(%/?)([^>]-)(%/?)>',
    -- name="value" attribute (double quotes)
    _ATTR1 = '([%w-:_]+)%s*=%s*"(.-)"',
    -- name='value' attribute (single quotes)
    _ATTR2 = '([%w-:_]+)%s*=%s*\'(.-)\'',
    _CDATA = '<%!%[CDATA%[(.-)%]%]>',
    _PI = '<%?(.-)%?>',
    _COMMENT = '<!%-%-(.-)%-%->',
    -- Captures everything before the first whitespace character (the tag name)
    _TAG = '^(.-)%s.*',
    _LEADINGWS = '^%s+',
    _TRAILINGWS = '%s+$',
    _WS = '^%s*$',
    -- DOCTYPE variants: SYSTEM/PUBLIC identifiers with or without an
    -- internal subset (the %b[] part)
    _DTD1 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>',
    _DTD2 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>',
    _DTD3 = '<!DOCTYPE%s+(.-)%s*(%b[])%s*>',
    _DTD4 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>',
    _DTD5 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>',

    -- Meant to match an attribute with non-closing double quotes.
    -- NOTE(review): Lua patterns have no lazy "+?" quantifier; here "=+" is
    -- followed by an item that matches a literal '?'. Confirm this is the
    -- intended behaviour before relying on it.
    _ATTRERR1 = '=+?%s*"[^"]*$',
    -- Meant to match an attribute with non-closing single quotes
    -- (same review note as _ATTRERR1).
    _ATTRERR2 = '=+?%s*\'[^\']*$',
    -- Matches a closing tag such as </person> or the end of an opening tag such as <person>
    _TAGEXT = '(%/?)>',

    _errstr = {
        xmlErr = "Error Parsing XML",
        declErr = "Error Parsing XMLDecl",
        declStartErr = "XMLDecl not at start of document",
        declAttrErr = "Invalid XMLDecl attributes",
        piErr = "Error Parsing Processing Instruction",
        commentErr = "Error Parsing Comment",
        cdataErr = "Error Parsing CDATA",
        dtdErr = "Error Parsing DTD",
        endTagErr = "End Tag Attributes Invalid",
        unmatchedTagErr = "Unbalanced Tag",
        incompleteXmlErr = "Incomplete XML Document",
    },

    -- Maps a Lua pattern to its replacement (a string, or a function that
    -- receives the pattern's capture). Keys are the escaped entity
    -- references, not the characters they stand for.
    _ENTITIES = {
        ["&lt;"] = "<",
        ["&gt;"] = ">",
        ["&amp;"] = "&",
        ["&quot;"] = '"',
        ["&apos;"] = "'",
        ["&#(%d+);"] = decimalToHtmlChar,
        ["&#x(%x+);"] = hexadecimalToHtmlChar,
    },
}

--- Instantiates a XmlParser object.
--@param _handler Handler module to be used to convert the XML string
--               to another formats. See the available handlers at the handler directory.
--               Usually you get an instance to a handler module using, for instance:
--               local handler = require("xmlhandler/tree").
--@param _options Options for this XmlParser instance. May be omitted, in
--               which case an empty options table is used.
--@see XmlParser.options
--@return the new parser object, with XmlParser as its metatable
function XmlParser.new(_handler, _options)
    local obj = {
        handler = _handler,
        -- The parsing functions read self.options.<name> unconditionally,
        -- so a missing options argument must become an empty table.
        options = _options or {},
        _stack = {}
    }

    setmetatable(obj, XmlParser)
    -- Kept from the original implementation so the instance's own fields
    -- stay the same for existing callers.
    obj.__index = XmlParser
    return obj
end

--- Tells whether a field is present on a value or anywhere along its
-- chain of metatables.
--@param table the table to check if it has a given function
--@param elementName the name of the function/field to check if exists
--@return true if the function/field exists, false otherwise
local function fexists(table, elementName)
    local candidate = table
    -- Walk from the value itself through each successive metatable.
    while candidate ~= nil do
        if candidate[elementName] ~= nil then
            return true
        end
        candidate = getmetatable(candidate)
    end
    return false
end

--- Forwards a parsing error to the user-supplied error handler, if any.
-- Does nothing when the parser options define no errorHandler.
--@param self the parser object (its options table is consulted)
--@param err the error message
--@param pos the position in the XML string the error refers to
local function err(self, err, pos)
    local handler = self.options.errorHandler
    if not handler then
        return
    end
    handler(err, pos)
end

--- Trims leading and trailing whitespace from a string when the parser's
-- stripWS option is set; otherwise the string is returned untouched.
--@param self the parser object (its options table is consulted)
--@param s the string to trim
--@return the (possibly trimmed) string
local function stripWS(self, s)
    if not self.options.stripWS then
        return s
    end

    local trimmed = string.gsub(s, '^%s+', '')
    trimmed = string.gsub(trimmed, '%s+$', '')
    return trimmed
end

--- Expands the entities listed in self._ENTITIES when the expandEntities
-- option is set; otherwise returns the string untouched.
-- Each key of self._ENTITIES is used as a Lua pattern and each value as the
-- replacement given to string.gsub.
--@param self the parser object (options and _ENTITIES are consulted)
--@param s the string in which entities should be expanded
--@return the string with entities expanded
local function parseEntities(self, s)
    if not self.options.expandEntities then
        return s
    end

    local AMP = "&amp;"
    local entities = self._ENTITIES

    -- pairs() visits keys in an unspecified order. If "&amp;" were expanded
    -- before the other entities, escaped text such as "&amp;lt;" would be
    -- unescaped twice and end up as "<". So "&amp;" is held back and
    -- applied after everything else.
    for pattern, replacement in pairs(entities) do
        if pattern ~= AMP then
            s = string.gsub(s, pattern, replacement)
        end
    end

    local ampReplacement = entities[AMP]
    if ampReplacement ~= nil then
        s = string.gsub(s, AMP, ampReplacement)
    end

    return s
end

--- Parses a string representing a tag.
--@param self the parser object (its _TAG, _ATTR1 and _ATTR2 patterns are used)
--@param s String containing tag text
--@return a {name, attrs} table
-- where name is the name of the tag and attrs
-- is a table containing the attributes of the tag,
-- or nil when the tag has no attributes
local function parseTag(self, s)
    local attrs = {}
    -- Tracked in a local rather than in a marker key inside attrs, so that
    -- an attribute that is itself named "_" is not thrown away.
    local hasAttrs = false

    local function collect(key, value)
        attrs[key] = parseEntities(self, value)
        hasAttrs = true
    end

    -- Double-quoted attributes first, then single-quoted ones.
    string.gsub(s, self._ATTR1, collect)
    string.gsub(s, self._ATTR2, collect)

    local tag = {
        -- Parentheses keep only gsub's first result (the name itself).
        name = (string.gsub(s, self._TAG, '%1')),
    }
    if hasAttrs then
        tag.attrs = attrs
    end

    return tag
end

--- Parses the XML declaration (<?xml ... ?>) found at f.pos and notifies
-- the handler's decl function when it defines one.
-- Reports declErr when no declaration is found, declStartErr when it is not
-- at position 1, and declAttrErr when the version attribute is missing.
--@param self the parser object
--@param xml the XML string being parsed
--@param f the table holding the current string.find results and position
--@return the parsed {name, attrs} tag table
local function parseXmlDeclaration(self, xml, f)
    -- XML Declaration
    f.match, f.endMatch, f.text = string.find(xml, self._PI, f.pos)
    if not f.match then
        err(self, self._errstr.declErr, f.pos)
    end

    if f.match ~= 1 then
        -- Must be at start of doc if present
        err(self, self._errstr.declStartErr, f.pos)
    end

    local tag = parseTag(self, f.text)
    -- TODO: Check if attributes are valid
    -- Check for version (mandatory). A declaration without any attribute
    -- at all (tag.attrs == nil) lacks the version too and must be reported.
    if not tag.attrs or tag.attrs.version == nil then
        err(self, self._errstr.declAttrErr, f.pos)
    end

    if fexists(self.handler, 'decl') then
        self.handler:decl(tag, f.match, f.endMatch)
    end

    return tag
end

--- Parses a processing instruction (<? ... ?>) found at f.pos.
-- The instruction is only parsed and delivered when the handler defines a
-- pi function; otherwise an empty table is returned.
--@param self the parser object
--@param xml the XML string being parsed
--@param f the table holding the current string.find results and position
--@return the parsed tag table, or an empty table when the handler has no pi function
local function parseXmlProcessingInstruction(self, xml, f)
    local tag = {}

    -- XML Processing Instruction (PI)
    f.match, f.endMatch, f.text = string.find(xml, self._PI, f.pos)
    if not f.match then
        err(self, self._errstr.piErr, f.pos)
    end

    if not fexists(self.handler, 'pi') then
        return tag
    end

    -- Parse PI attributes & text
    tag = parseTag(self, f.text)
    -- Whatever follows the instruction name is stored under attrs._text.
    local body = string.sub(f.text, string.len(tag.name) + 1)
    if body ~= "" then
        tag.attrs = tag.attrs or {}
        tag.attrs._text = body
    end
    self.handler:pi(tag, f.match, f.endMatch)

    return tag
end

--- Parses a comment (<!-- ... -->) found at f.pos and delivers its text to
-- the handler's comment function when it defines one.
-- Reports commentErr when no comment is found at or after f.pos.
--@param self the parser object
--@param xml the XML string being parsed
--@param f the table holding the current string.find results and position
local function parseComment(self, xml, f)
    f.match, f.endMatch, f.text = string.find(xml, self._COMMENT, f.pos)
    if not f.match then
        err(self, self._errstr.commentErr, f.pos)
    end

    if fexists(self.handler, 'comment') then
        f.text = parseEntities(self, stripWS(self, f.text))
        -- The second argument is nil, as in the text and cdata callbacks;
        -- the original passed the global function `next` here.
        self.handler:comment(f.text, nil, f.match, f.endMatch)
    end
end

--- Tries each DOCTYPE pattern (_DTD1 to _DTD5) in turn against the XML string.
--@param self the parser object (its _DTD1.._DTD5 patterns are used)
--@param xml the XML string being parsed
--@param pos the position where the search starts
--@return the match start, the match end and a table describing the DTD
-- (fields among _root, _type, _name, _uri and _internal, depending on the
-- variant that matched), or nil when no pattern matches
local function _parseDtd(self, xml, pos)
    -- match,endMatch,root,type,name,uri,internal
    local m, e, r, t, n, u, i

    -- SYSTEM identifier with internal subset
    m, e, r, t, u, i = string.find(xml, self._DTD1, pos)
    if m then
        return m, e, { _root = r, _type = t, _uri = u, _internal = i }
    end

    -- PUBLIC identifier with internal subset
    m, e, r, t, n, u, i = string.find(xml, self._DTD2, pos)
    if m then
        return m, e, { _root = r, _type = t, _name = n, _uri = u, _internal = i }
    end

    -- Internal subset only
    m, e, r, i = string.find(xml, self._DTD3, pos)
    if m then
        return m, e, { _root = r, _internal = i }
    end

    -- SYSTEM identifier only. The document is in `xml`; the original
    -- searched an undefined variable `s` here and in the next attempt.
    m, e, r, t, u = string.find(xml, self._DTD4, pos)
    if m then
        return m, e, { _root = r, _type = t, _uri = u }
    end

    -- PUBLIC identifier only
    m, e, r, t, n, u = string.find(xml, self._DTD5, pos)
    if m then
        return m, e, { _root = r, _type = t, _name = n, _uri = u }
    end

    return nil
end

--- Parses a DOCTYPE declaration found at f.pos and delivers it to the
-- handler's dtd function when it defines one.
-- Reports dtdErr when none of the DOCTYPE patterns matches; in that case
-- the handler is not called, since there is nothing to deliver.
--@param self the parser object
--@param xml the XML string being parsed
--@param f the table holding the current string.find results and position
local function parseDtd(self, xml, f)
    -- _parseDtd is a local function, not a field of the parser, so it has
    -- to be called directly; attrs is kept local so no global is created.
    local attrs
    f.match, f.endMatch, attrs = _parseDtd(self, xml, f.pos)
    if not f.match then
        err(self, self._errstr.dtdErr, f.pos)
    end

    if attrs and fexists(self.handler, 'dtd') then
        self.handler:dtd(attrs._root, attrs, f.match, f.endMatch)
    end
end

--- Parses a CDATA section (<![CDATA[ ... ]]>) found at f.pos and delivers
-- its raw content to the handler's cdata function when it defines one.
-- Reports cdataErr when no CDATA section is found at or after f.pos.
--@param self the parser object
--@param xml the XML string being parsed
--@param f the table holding the current string.find results and position
local function parseCdata(self, xml, f)
    local first, last, content = string.find(xml, self._CDATA, f.pos)
    f.match, f.endMatch, f.text = first, last, content

    if not first then
        err(self, self._errstr.cdataErr, f.pos)
    end

    if not fexists(self.handler, 'cdata') then
        return
    end
    self.handler:cdata(content, nil, first, last)
end

--- Parse a Normal tag
-- Need check for embedded '>' in attribute value and extend
-- match recursively if necessary eg. <tag attr="123>456">
--
-- Start tags are pushed onto self._stack and end tags pop it; this
-- bookkeeping is done whether or not the handler defines starttag/endtag,
-- so the end-of-document check sees a balanced stack for any handler.
--@param self the parser object
--@param xml the XML string being parsed
--@param f the table holding the current string.find results and position
--@return the parsed {name, attrs} tag table
local function parseNormalTag(self, xml, f)
    --Check for errors
    while true do
        --If there isn't an attribute without closing quotes (single or double quotes)
        --then breaks to follow the normal processing of the tag.
        --Otherwise, try to find where the quotes close.
        f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR1)

        if f.errEnd == nil then
            f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR2)
            if f.errEnd == nil then
                break
            end
        end

        f.extStart, f.extEnd, f.endt2 = string.find(xml, self._TAGEXT, f.endMatch + 1)
        if not f.extStart then
            -- No further '>' exists, so the tag cannot be extended. Report
            -- it and stop, instead of doing arithmetic on a nil f.extEnd.
            err(self, self._errstr.xmlErr, f.pos)
            break
        end
        f.tagstr = f.tagstr .. string.sub(xml, f.endMatch, f.extEnd - 1)
        f.endMatch = f.extEnd
    end

    -- Extract tag name and attrs
    local tag = parseTag(self, f.tagstr)

    if f.endt1 == "/" then
        if tag.attrs then
            -- Shouldn't have any attributes in endtag
            err(self, string.format("%s (/%s)", self._errstr.endTagErr, tag.name), f.pos)
        end
        -- Always pop: the matching start tag was pushed unconditionally.
        if table.remove(self._stack) ~= tag.name then
            err(self, string.format("%s (/%s)", self._errstr.unmatchedTagErr, tag.name), f.pos)
        end
        if fexists(self.handler, 'endtag') then
            self.handler:endtag(tag, f.match, f.endMatch)
        end
    else
        table.insert(self._stack, tag.name)
        if fexists(self.handler, 'starttag') then
            self.handler:starttag(tag, f.match, f.endMatch)
        end
        --TODO: Self-closing tags are currently being returned as a table,
        --which makes them hard for the NCLua app to handle. They should
        --be returned as an empty string field instead.

        -- Self-Closing Tag
        if f.endt2 == "/" then
            table.remove(self._stack)
            if fexists(self.handler, 'endtag') then
                self.handler:endtag(tag, f.match, f.endMatch)
            end
        end
    end

    return tag
end

--- Looks at how the current tag text (f.tagstr) starts and hands the tag to
-- the matching parsing function: XML declaration, processing instruction,
-- comment, DOCTYPE, CDATA section or, failing all of those, a normal tag.
--@param self the parser object
--@param xml the XML string being parsed
--@param f the table holding the current string.find results and position
local function parseTagType(self, xml, f)
    local tagstr = f.tagstr
    local parseAs

    -- Test for tag type
    if string.find(string.sub(tagstr, 1, 5), "?xml%s") then
        parseAs = parseXmlDeclaration
    elseif string.sub(tagstr, 1, 1) == "?" then
        parseAs = parseXmlProcessingInstruction
    elseif string.sub(tagstr, 1, 3) == "!--" then
        parseAs = parseComment
    elseif string.sub(tagstr, 1, 8) == "!DOCTYPE" then
        parseAs = parseDtd
    elseif string.sub(tagstr, 1, 8) == "![CDATA[" then
        parseAs = parseCdata
    else
        parseAs = parseNormalTag
    end

    parseAs(self, xml, f)
end

--- Get next tag (first pass - fix exceptions below).
-- Stores the string.find results for the next tag in f. When nothing
-- matches, the rest of the input is either whitespace (end of document,
-- with incompleteXmlErr reported if tags are still open) or unparsable
-- text (xmlErr reported).
--@param self the parser object
--@param xml the XML string being parsed
--@param f the table holding the current string.find results and position
--@return true if the next tag could be got, false otherwise
local function getNextTag(self, xml, f)
    f.match, f.endMatch, f.text, f.endt1, f.tagstr, f.endt2 = string.find(xml, self._XML, f.pos)

    if not f.match then
        local onlyWhitespaceLeft = string.find(xml, self._WS, f.pos)
        if not onlyWhitespaceLeft then
            -- Unparsable text
            err(self, self._errstr.xmlErr, f.pos)
        elseif #self._stack == 0 then
            -- No more text and every tag was closed: document complete
            return false
        else
            err(self, self._errstr.incompleteXmlErr, f.pos)
        end
    end

    f.text = f.text or ''
    f.tagstr = f.tagstr or ''
    f.match = f.match or 0

    return f.endMatch ~= nil
end

--- Main function which starts the XML parsing process.
-- Walks the XML string tag by tag, delivering the text found before each
-- tag to the handler's text function (when defined and the text is not
-- empty) and then dispatching the tag itself.
--@param xml the XML string to parse
--@param parseAttributes indicates if tag attributes should be parsed or not.
--       If omitted, the default value is true. The value is stored in
--       self.handler.parseAttributes.
function XmlParser:parse(xml, parseAttributes)
    if type(self) ~= "table" or getmetatable(self) ~= XmlParser then
        error("You must call xmlparser:parse(parameters) instead of xmlparser.parse(parameters)")
    end

    if parseAttributes == nil then
        parseAttributes = true
    end

    self.handler.parseAttributes = parseAttributes

    --Stores string.find results and parameters
    --and other auxiliary variables
    local f = {
        --string.find return
        match = 0,
        endMatch = 0,
        -- text, end1, tagstr, end2,

        --string.find parameters and auxiliary variables
        pos = 1,
        -- startText, endText,
        -- errStart, errEnd, extStart, extEnd,
    }

    while f.match do
        if not getNextTag(self, xml, f) then
            break
        end

        -- Handle leading text
        f.startText = f.match
        f.endText = f.match + string.len(f.text) - 1
        -- From here on f.match points at the tag that follows the text.
        f.match = f.match + string.len(f.text)
        f.text = parseEntities(self, stripWS(self, f.text))
        if f.text ~= "" and fexists(self.handler, 'text') then
            -- Report where the text itself starts; the advanced f.match is
            -- one past f.endText and would describe an empty range.
            self.handler:text(f.text, nil, f.startText, f.endText)
        end

        parseTagType(self, xml, f)
        f.pos = f.endMatch + 1
    end
end

-- XmlParser.new sets XmlParser as the metatable of every parser object;
-- pointing __index back at the table lets those objects find the
-- functions and patterns defined on XmlParser.
XmlParser.__index = XmlParser

-- The module's value is the XmlParser table itself.
return XmlParser