You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

456 lines
14 KiB
Lua

--- @module Class providing the actual XML parser.
-- Available options are:
-- * stripWS
-- Strip non-significant whitespace (leading/trailing)
-- and do not generate events for empty text elements
--
-- * expandEntities
-- Expand entities (standard entities + single char
-- numeric entities only currently - could be extended
-- at runtime if a suitable DTD parser added elements
-- to the table, see obj._ENTITIES). It may also be possible
-- to expand multibyte entities for UTF-8 only
--
-- * errorHandler
-- Custom error handler function
--
-- NOTE: To disable a boolean option set it to 'nil', not '0'
-- (in Lua the number 0 is truthy, so '0' would enable the option)
--- Converts a decimal character code into a one-byte string.
-- Codes in the range 0..255 are turned into the corresponding byte via
-- string.char; any other code is returned unchanged as a numeric
-- character reference in the form &#code;
--@param code the decimal value (string or number) to convert
--@return the single-byte string, or the "&#code;" reference
local function decimalToHtmlChar(code)
    local value = tonumber(code)
    if value < 0 or value >= 256 then
        -- Outside the single-byte range: leave the reference as it was.
        return "&#" .. code .. ";"
    end
    return string.char(value)
end
--- Converts a hexadecimal character code into a one-byte string.
-- Codes whose value is in the range 0..255 are turned into the
-- corresponding byte via string.char; any other code is returned
-- unchanged as a numeric character reference in the form &#xCode;
--@param code the hexadecimal digits to convert
--@return the single-byte string, or the "&#xCode;" reference
local function hexadecimalToHtmlChar(code)
    local value = tonumber(code, 16)
    if value < 0 or value >= 256 then
        -- Outside the single-byte range: leave the reference as it was.
        return "&#x" .. code .. ";"
    end
    return string.char(value)
end
-- Parser "class" table: holds the Lua patterns, error messages and the
-- entity table used by the parsing functions below. All strings here are
-- Lua patterns (not regular expressions).
local XmlParser = {
-- Private attributes/functions
-- Next tag from the current position: captures the leading text, an
-- optional '/' right after '<' (end tag), the tag content, and an optional
-- '/' right before '>' (self-closing tag).
_XML = '^([^<]*)<(%/?)([^>]-)(%/?)>',
-- Attribute with a double-quoted value: captures name and value.
_ATTR1 = '([%w-:_]+)%s*=%s*"(.-)"',
-- Attribute with a single-quoted value: captures name and value.
_ATTR2 = '([%w-:_]+)%s*=%s*\'(.-)\'',
-- CDATA section: captures the raw content.
_CDATA = '<%!%[CDATA%[(.-)%]%]>',
-- Processing instruction / XML declaration: captures the text between <? and ?>.
_PI = '<%?(.-)%?>',
-- Comment: captures the comment text.
_COMMENT = '<!%-%-(.-)%-%->',
-- Tag name: everything before the first whitespace character.
_TAG = '^(.-)%s.*',
_LEADINGWS = '^%s+',
_TRAILINGWS = '%s+$',
-- String made only of whitespace (or empty).
_WS = '^%s*$',
-- DOCTYPE variants, tried in this order by _parseDtd:
-- SYSTEM id with internal subset
_DTD1 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>',
-- PUBLIC id with internal subset
_DTD2 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>',
-- internal subset only
_DTD3 = '<!DOCTYPE%s+(.-)%s*(%b[])%s*>',
-- SYSTEM id only
_DTD4 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>',
-- PUBLIC id only
_DTD5 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>',
--Intended to match an attribute with non-closing double quotes.
--NOTE(review): Lua patterns have no lazy "+?" quantifier. Here "=+" is
--greedy and the following "?" is a literal question mark, so this only
--matches text such as =?"abc - confirm whether '=%s*"[^"]*$' was intended.
_ATTRERR1 = '=+?%s*"[^"]*$',
--Intended to match an attribute with non-closing single quotes.
--NOTE(review): same caveat as _ATTRERR1 - the "?" is matched literally.
_ATTRERR2 = '=+?%s*\'[^\']*$',
--Matches the end of a tag: '>' optionally preceded by '/' (captured)
_TAGEXT = '(%/?)>',
-- Messages passed to the error handler (see err()).
_errstr = {
xmlErr = "Error Parsing XML",
declErr = "Error Parsing XMLDecl",
declStartErr = "XMLDecl not at start of document",
declAttrErr = "Invalid XMLDecl attributes",
piErr = "Error Parsing Processing Instruction",
commentErr = "Error Parsing Comment",
cdataErr = "Error Parsing CDATA",
dtdErr = "Error Parsing DTD",
endTagErr = "End Tag Attributes Invalid",
unmatchedTagErr = "Unbalanced Tag",
incompleteXmlErr = "Incomplete XML Document",
},
-- Entity expansion table: each key is a Lua pattern and each value is the
-- replacement (a string, or a function receiving the pattern's capture).
_ENTITIES = {
["&lt;"] = "<",
["&gt;"] = ">",
["&amp;"] = "&",
["&quot;"] = '"',
["&apos;"] = "'",
["&#(%d+);"] = decimalToHtmlChar,
["&#x(%x+);"] = hexadecimalToHtmlChar,
},
}
--- Instantiates a XmlParser object.
--@param _handler Handler module to be used to convert the XML string
-- to another formats. See the available handlers at the handler directory.
-- Usually you get an instance to a handler module using, for instance:
-- local handler = require("xmlhandler/tree").
--@param _options Options for this XmlParser instance (table with the
-- optional fields stripWS, expandEntities and errorHandler). May be
-- omitted, in which case an empty option table is used.
--@return the new parser object
--@see XmlParser.options
function XmlParser.new(_handler, _options)
    local obj = {
        handler = _handler,
        -- Default to an empty table so that option lookups such as
        -- self.options.stripWS never index a nil value.
        options = _options or {},
        _stack = {}
    }
    -- Method lookup goes through the metatable's __index (set on XmlParser
    -- itself at the end of the module); the instance needs no __index field.
    return setmetatable(obj, XmlParser)
end
--- Tells whether a field is reachable from a table or from the chain of
-- metatables starting at it.
--@param tbl the table to inspect (nil is accepted)
--@param elementName the name of the function/field to look for
--@return true if the function/field exists, false otherwise
local function fexists(tbl, elementName)
    local current = tbl
    -- Walk the table, then its metatable, then that metatable's metatable...
    while current ~= nil do
        if current[elementName] ~= nil then
            return true
        end
        current = getmetatable(current)
    end
    return false
end
--- Reports a parsing error through the user supplied error handler.
-- Does nothing when no errorHandler option was configured.
--@param self the parser instance
--@param errMsg the error message
--@param pos position in the XML string the error refers to
local function err(self, errMsg, pos)
    local handler = self.options.errorHandler
    if not handler then
        return
    end
    handler(errMsg, pos)
end
--- Removes leading and trailing whitespace from a string when the
-- stripWS option is enabled; otherwise returns the string untouched.
--@param self the parser instance
--@param s the string to trim
--@return the (possibly trimmed) string
local function stripWS(self, s)
    if not self.options.stripWS then
        return s
    end
    local withoutLeading = string.gsub(s, '^%s+', '')
    local trimmed = string.gsub(withoutLeading, '%s+$', '')
    return trimmed
end
--- Expands entity references in a string when the expandEntities option
-- is enabled; otherwise returns the string untouched.
--
-- The expansion is done in a single pass over the "&...;" tokens of the
-- input, so the result of one expansion is never expanded again
-- (e.g. "&amp;lt;" yields "&lt;", not "<") and the outcome does not depend
-- on the unspecified iteration order of pairs().
--@param self the parser instance; self._ENTITIES maps Lua patterns to
-- replacements (string, table or function, as accepted by string.gsub)
--@param s the string to process
--@return the string with known entities expanded
local function parseEntities(self, s)
    if not self.options.expandEntities then
        return s
    end
    local entities = self._ENTITIES
    local function expand(token)
        for pattern, replacement in pairs(entities) do
            -- Anchor the pattern so it must describe the whole token.
            local anchored = "^" .. pattern .. "$"
            if string.find(token, anchored) then
                return (string.gsub(token, anchored, replacement))
            end
        end
        -- Unknown entity: returning nothing keeps the original text.
    end
    -- Parentheses drop gsub's second result (the substitution count).
    return (string.gsub(s, "&[^&;%s]+;", expand))
end
--- Parses a string representing a tag.
--@param self the parser instance (provides the _TAG, _ATTR1 and _ATTR2 patterns)
--@param s String containing tag text
--@return a {name, attrs} table
-- where name is the name of the tag and attrs
-- is a table containing the attributes of the tag
-- (attrs is nil when the tag has no attributes)
local function parseTag(self, s)
    local tag = {
        -- Parentheses keep only the resulting string, not the match count.
        name = (string.gsub(s, self._TAG, '%1')),
        attrs = {}
    }
    -- Tracked in a local flag rather than in tag.attrs itself, so that an
    -- attribute literally named "_" is not lost.
    local hasAttrs = false
    local function collect(k, v)
        tag.attrs[k] = parseEntities(self, v)
        hasAttrs = true
    end
    string.gsub(s, self._ATTR1, collect) -- double-quoted values
    string.gsub(s, self._ATTR2, collect) -- single-quoted values
    if not hasAttrs then
        tag.attrs = nil
    end
    return tag
end
--- Parses the XML declaration (<?xml ... ?>) found at or after f.pos and
-- notifies the handler's decl function, when it has one.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f parsing state table; on success f.match, f.endMatch and f.text
-- are updated with the declaration's bounds and inner text
--@return the parsed tag ({name, attrs}), or nil if the declaration is
-- not terminated by "?>"
local function parseXmlDeclaration(self, xml, f)
    -- XML Declaration
    local match, endMatch, text = string.find(xml, self._PI, f.pos)
    if not match then
        -- Leave f untouched so the caller can keep going after the tag
        -- it already matched instead of failing on nil positions.
        err(self, self._errstr.declErr, f.pos)
        return nil
    end
    f.match, f.endMatch, f.text = match, endMatch, text

    if match ~= 1 then
        -- Must be at start of doc if present
        err(self, self._errstr.declStartErr, f.pos)
    end

    local tag = parseTag(self, text)
    -- TODO: Check if attributes are valid
    -- Check for version (mandatory); a declaration with no attributes at
    -- all is missing it too.
    if not tag.attrs or tag.attrs.version == nil then
        err(self, self._errstr.declAttrErr, f.pos)
    end

    if fexists(self.handler, 'decl') then
        self.handler:decl(tag, f.match, f.endMatch)
    end
    return tag
end
--- Parses a processing instruction (<?target ... ?>) found at or after
-- f.pos and notifies the handler's pi function, when it has one.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f parsing state table; on success f.match, f.endMatch and f.text
-- are updated with the instruction's bounds and inner text
--@return the parsed tag, or an empty table when the handler has no pi
-- function or the instruction is not terminated by "?>"
local function parseXmlProcessingInstruction(self, xml, f)
    local tag = {}
    -- XML Processing Instruction (PI)
    local match, endMatch, text = string.find(xml, self._PI, f.pos)
    if not match then
        -- Leave f untouched so the caller can keep going after the tag
        -- it already matched instead of failing on nil positions.
        err(self, self._errstr.piErr, f.pos)
        return tag
    end
    f.match, f.endMatch, f.text = match, endMatch, text

    if fexists(self.handler, 'pi') then
        -- Parse PI attributes & text
        tag = parseTag(self, text)
        -- Everything after the target name is exposed as attrs._text
        local pi = string.sub(text, string.len(tag.name) + 1)
        if pi ~= "" then
            if tag.attrs then
                tag.attrs._text = pi
            else
                tag.attrs = { _text = pi }
            end
        end
        self.handler:pi(tag, f.match, f.endMatch)
    end
    return tag
end
--- Parses a comment (<!-- ... -->) found at or after f.pos and notifies
-- the handler's comment function, when it has one.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f parsing state table; on success f.match, f.endMatch and f.text
-- are updated with the comment's bounds and text
local function parseComment(self, xml, f)
    local match, endMatch, text = string.find(xml, self._COMMENT, f.pos)
    if not match then
        -- Leave f untouched so the caller can keep going after the tag
        -- it already matched instead of failing on nil positions.
        err(self, self._errstr.commentErr, f.pos)
        return
    end
    f.match, f.endMatch, f.text = match, endMatch, text

    if fexists(self.handler, 'comment') then
        f.text = parseEntities(self, stripWS(self, f.text))
        -- Second argument is the (absent) attribute table, passed as nil
        -- like the cdata and text events do.
        self.handler:comment(f.text, nil, f.match, f.endMatch)
    end
end
--- Tries each supported DOCTYPE form, in order, against the XML string.
--@param self the parser instance (provides the _DTD1.._DTD5 patterns)
--@param xml the XML string being parsed
--@param pos position where the search starts
--@return match start, match end and a table describing the DOCTYPE
-- (fields _root, _type, _name, _uri, _internal, depending on the form
-- that matched); a single nil when no form matches
local function _parseDtd(self, xml, pos)
    -- match,endMatch,root,type,name,uri,internal
    local m, e, r, t, n, u, i

    -- SYSTEM identifier with internal subset
    m, e, r, t, u, i = string.find(xml, self._DTD1, pos)
    if m then
        return m, e, { _root = r, _type = t, _uri = u, _internal = i }
    end

    -- PUBLIC identifier with internal subset
    m, e, r, t, n, u, i = string.find(xml, self._DTD2, pos)
    if m then
        return m, e, { _root = r, _type = t, _name = n, _uri = u, _internal = i }
    end

    -- Internal subset only
    m, e, r, i = string.find(xml, self._DTD3, pos)
    if m then
        return m, e, { _root = r, _internal = i }
    end

    -- SYSTEM identifier only
    m, e, r, t, u = string.find(xml, self._DTD4, pos)
    if m then
        return m, e, { _root = r, _type = t, _uri = u }
    end

    -- PUBLIC identifier only
    m, e, r, t, n, u = string.find(xml, self._DTD5, pos)
    if m then
        return m, e, { _root = r, _type = t, _name = n, _uri = u }
    end

    return nil
end
--- Parses a DOCTYPE declaration found at or after f.pos and notifies the
-- handler's dtd function, when it has one.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f parsing state table; on success f.match and f.endMatch are
-- updated with the declaration's bounds
local function parseDtd(self, xml, f)
    -- _parseDtd is a local helper, not a method of the parser object, so
    -- it is called as a plain function.
    local match, endMatch, attrs = _parseDtd(self, xml, f.pos)
    if not match then
        -- Leave f untouched so the caller can keep going after the tag
        -- it already matched instead of failing on nil positions.
        err(self, self._errstr.dtdErr, f.pos)
        return
    end
    f.match, f.endMatch = match, endMatch

    if fexists(self.handler, 'dtd') then
        self.handler:dtd(attrs._root, attrs, f.match, f.endMatch)
    end
end
--- Parses a CDATA section found at or after f.pos and notifies the
-- handler's cdata function, when it has one.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f parsing state table; on success f.match, f.endMatch and f.text
-- are updated with the section's bounds and raw content
local function parseCdata(self, xml, f)
    local match, endMatch, text = string.find(xml, self._CDATA, f.pos)
    if not match then
        -- Leave f untouched so the caller can keep going after the tag
        -- it already matched instead of failing on nil positions.
        err(self, self._errstr.cdataErr, f.pos)
        return
    end
    f.match, f.endMatch, f.text = match, endMatch, text

    if fexists(self.handler, 'cdata') then
        -- CDATA content is passed verbatim: no trimming, no entity expansion.
        self.handler:cdata(f.text, nil, f.match, f.endMatch)
    end
end
--- Parse a Normal tag (start tag, end tag or self-closing tag).
-- Checks for an embedded '>' inside an attribute value and extends the
-- match as needed, eg. <tag attr="123>456">, then keeps the stack of open
-- tags up to date and fires the handler's starttag/endtag events.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f parsing state table filled by getNextTag (tagstr, endt1, endt2,
-- match, endMatch, pos); tagstr, endt2 and endMatch are updated when the
-- tag has to be extended
--@return the parsed tag ({name, attrs})
local function parseNormalTag(self, xml, f)
    --Check for errors
    while true do
        --If there isn't an attribute without closing quotes (single or double quotes)
        --then breaks to follow the normal processing of the tag.
        --Otherwise, try to find where the quotes close.
        f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR1)
        if f.errEnd == nil then
            f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR2)
            if f.errEnd == nil then
                break
            end
        end

        local extStart, extEnd, endt2 = string.find(xml, self._TAGEXT, f.endMatch + 1)
        if not extEnd then
            -- No further '>' in the document: the quote is never closed.
            -- Report it and process the tag with what was collected so far.
            err(self, self._errstr.xmlErr, f.pos)
            break
        end
        f.extStart, f.extEnd, f.endt2 = extStart, extEnd, endt2
        -- Append from the '>' that ended the previous match up to (but not
        -- including) the newly found '>'.
        f.tagstr = f.tagstr .. string.sub(xml, f.endMatch, extEnd - 1)
        f.endMatch = extEnd
    end

    -- Extract tag name and attrs
    local tag = parseTag(self, f.tagstr)

    if f.endt1 == "/" then
        -- End tag. Validation and stack bookkeeping happen whether or not
        -- the handler listens to 'endtag', so the stack pushed by start
        -- tags always gets balanced.
        if tag.attrs then
            -- Shouldn't have any attributes in endtag
            err(self, string.format("%s (/%s)", self._errstr.endTagErr, tag.name), f.pos)
        end
        if table.remove(self._stack) ~= tag.name then
            err(self, string.format("%s (/%s)", self._errstr.unmatchedTagErr, tag.name), f.pos)
        end
        if fexists(self.handler, 'endtag') then
            self.handler:endtag(tag, f.match, f.endMatch)
        end
    else
        table.insert(self._stack, tag.name)
        if fexists(self.handler, 'starttag') then
            self.handler:starttag(tag, f.match, f.endMatch)
        end

        --TODO: Self-closing tags are being returned as a table, which
        --makes them harder for the NCLua application to handle. An empty
        --string field should be returned instead.

        -- Self-Closing Tag
        if f.endt2 == "/" then
            table.remove(self._stack)
            if fexists(self.handler, 'endtag') then
                self.handler:endtag(tag, f.match, f.endMatch)
            end
        end
    end
    return tag
end
--- Looks at the beginning of the tag text to find out what kind of
-- construct it is and hands it over to the matching parse function.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f parsing state table; f.tagstr holds the tag text to classify
local function parseTagType(self, xml, f)
    local tagstr = f.tagstr
    local prefix8 = string.sub(tagstr, 1, 8)
    local parser

    if string.find(string.sub(tagstr, 1, 5), "?xml%s") then
        parser = parseXmlDeclaration
    elseif string.sub(tagstr, 1, 1) == "?" then
        parser = parseXmlProcessingInstruction
    elseif string.sub(tagstr, 1, 3) == "!--" then
        parser = parseComment
    elseif prefix8 == "!DOCTYPE" then
        parser = parseDtd
    elseif prefix8 == "![CDATA[" then
        parser = parseCdata
    else
        parser = parseNormalTag
    end

    parser(self, xml, f)
end
--- Get next tag (first pass - exceptions are fixed by the specific parse
-- functions afterwards). Fills f.match, f.endMatch, f.text, f.endt1,
-- f.tagstr and f.endt2 from the text starting at f.pos.
--@param self the parser instance
--@param xml the XML string being parsed
--@param f parsing state table
--@return true if the next tag could be got, false otherwise
local function getNextTag(self, xml, f)
    local first, last, text, endt1, tagstr, endt2 = string.find(xml, self._XML, f.pos)
    f.match, f.endMatch, f.text, f.endt1, f.tagstr, f.endt2 = first, last, text, endt1, tagstr, endt2

    if first == nil then
        local onlyWhitespaceLeft = string.find(xml, self._WS, f.pos)
        if not onlyWhitespaceLeft then
            -- Unparsable text
            err(self, self._errstr.xmlErr, f.pos)
        elseif #self._stack == 0 then
            -- No more text and every tag was closed: document complete
            return false
        else
            -- No more text but there are still open tags
            err(self, self._errstr.incompleteXmlErr, f.pos)
        end
    end

    f.text = f.text or ''
    f.tagstr = f.tagstr or ''
    f.match = f.match or 0

    return f.endMatch ~= nil
end
--- Main function which starts the XML parsing process.
-- Walks the XML string tag by tag, firing the handler's text event for
-- the text found before each tag and then dispatching the tag itself.
--@param xml the XML string to parse
--@param parseAttributes indicates if tag attributes should be parsed or not.
-- If omitted, the default value is true. The value is stored in
-- self.handler.parseAttributes.
function XmlParser:parse(xml, parseAttributes)
    if type(self) ~= "table" or getmetatable(self) ~= XmlParser then
        error("You must call xmlparser:parse(parameters) instead of xmlparser.parse(parameters)")
    end

    if parseAttributes == nil then
        parseAttributes = true
    end
    self.handler.parseAttributes = parseAttributes

    --Stores string.find results and parameters
    --and other auxiliar variables
    local f = {
        --string.find return
        match = 0,
        endMatch = 0,
        -- text, end1, tagstr, end2,

        --string.find parameters and auxiliar variables
        pos = 1,
        -- startText, endText,
        -- errStart, errEnd, extStart, extEnd,
    }

    while f.match do
        if not getNextTag(self, xml, f) then
            break
        end

        -- Handle leading text: it spans startText..endText, and f.match
        -- is moved forward to the '<' of the tag that follows it.
        local textLength = string.len(f.text)
        f.startText = f.match
        f.endText = f.match + textLength - 1
        f.match = f.match + textLength
        f.text = parseEntities(self, stripWS(self, f.text))
        if f.text ~= "" and fexists(self.handler, 'text') then
            -- Report the bounds of the text itself, not the start of the
            -- following tag.
            self.handler:text(f.text, nil, f.startText, f.endText)
        end

        parseTagType(self, xml, f)

        -- A tag parser that could not find the end of its construct may
        -- leave no end position; stop instead of doing arithmetic on nil.
        if not f.endMatch then
            break
        end
        f.pos = f.endMatch + 1
    end
end
-- Objects created by XmlParser.new use XmlParser as their metatable, so
-- missing fields (patterns, error strings, methods) are looked up here.
XmlParser.__index = XmlParser
-- The module's value is the XmlParser table itself.
return XmlParser