Module:HTMLParser/ElementNode

Revision as of 22:48, 6 August 2020 by Till Kraemer (talk | contribs) (Created page with "-- Derivative work of: --[[ (The MIT license) Copyright (c) 2013, Wouter Scherphof ([email protected]) Permission is hereby granted, free of charge, to any person...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

Documentation for this module may be created at Module:HTMLParser/ElementNode/doc

-- Derivative work of:

--[[
	(The MIT license)
	Copyright (c) 2013, Wouter Scherphof ([email protected])
	Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
	The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--]]
 
-- Source: https://github.com/wscherphof/lua-htmlparser/tree/master/src/htmlparser

local Set = require("Module:Set")

-- Prototype table: holds every ElementNode method.
local ElementNode = {}
-- Metatable shared by all instances; field lookups that miss on an instance
-- fall back to the prototype above.
ElementNode.mt = {}
ElementNode.mt.__index = ElementNode
--- Create a node and link it into the tree.
-- @param nameortext element name; when `node` is nil/false this is instead the
--   complete source text, which is stored on the new root node as `_text`
-- @param node existing node used as the reference point; nil/false creates a root
-- @param descend truthy: the new node becomes a child of `node`;
--   falsy: it becomes a sibling of `node` (appended to `node.parent.nodes`)
-- @param openstart stored as `_openstart` (ignored for a root node)
-- @param openend stored as `_openend` (ignored for a root node)
-- @return the new node, with `ElementNode.mt` as its metatable
function ElementNode:new(nameortext, node, descend, openstart, openend)
  -- `parent`, `root` and `id` start out absent and are filled in below / later.
  local instance = {
    name = nameortext,
    level = 0,
    nodes = {},
    attributes = {},
    classes = {},
    -- until close() says otherwise, the closing span equals the opening span
    _openstart = openstart,
    _openend = openend,
    _closestart = openstart,
    _closeend = openend,
    -- indexes of everything below this node, filled by close() of descendants
    deepernodes = Set:new(),
    deeperelements = {},
    deeperattributes = {},
    deeperids = {},
    deeperclasses = {},
  }

  if node then
    instance.root = node.root
    if descend then
      -- one level below the reference node
      instance.parent = node
      instance.level = node.level + 1
    else
      -- same level as the reference node
      instance.parent = node.parent
      instance.level = node.level
    end
    table.insert(instance.parent.nodes, instance)
  else
    -- root node: owns the source text and spans all of it
    local length = string.len(nameortext)
    instance.name = "root"
    instance.root = instance
    instance._text = nameortext
    instance._openstart, instance._openend = 1, length
    instance._closestart, instance._closeend = 1, length
  end

  return setmetatable(instance, ElementNode.mt)
end

--- Return the slice of the root's source text from this node's `_openstart`
-- through its `_closeend`, inclusive.
-- @return string
function ElementNode:gettext()
  local source = self.root._text
  local first, last = self._openstart, self._closeend
  return string.sub(source, first, last)
end

--- Return the slice of the root's source text that lies strictly between
-- this node's `_openend` and `_closestart` positions.
-- @return string
function ElementNode:getcontent()
  local source = self.root._text
  local first = self._openend + 1
  local last = self._closestart - 1
  return string.sub(source, first, last)
end

--- Record an attribute on this node.
-- The value is stored in `attributes` under the key exactly as given. The
-- attribute name is additionally compared case-insensitively: an "id" value
-- is copied to `self.id`, and a "class" value is split on whitespace with
-- each token appended to `self.classes`.
-- @param k attribute name
-- @param v attribute value
function ElementNode:addattribute(k, v)
  self.attributes[k] = v

  local lowered = string.lower(k)
  if lowered == "id" then
    self.id = v
    return
  end
  if lowered ~= "class" then
    return
  end

  -- class attribute holds space-separated tokens; keep each one for quick access
  local classes = self.classes
  for token in string.gmatch(v, "%S+") do
    table.insert(classes, token)
  end
end

--- Add `node` to the set stored at `index[name]`, creating that set first
-- when the entry does not exist yet.
-- @param index table mapping names to sets
-- @param name key within `index`
-- @param node value handed to the set's `add` method
local function insert(index, name, node)
  local bucket = index[name]
  if not bucket then
    bucket = Set:new()
    index[name] = bucket
  end
  bucket:add(node)
end

--- Finish this node and register it with every ancestor.
-- When both positions are given they replace `_closestart` / `_closeend`;
-- otherwise the values set at construction are kept.
-- @param closestart optional start position of the closing span
-- @param closeend optional end position of the closing span
function ElementNode:close(closestart, closeend)
  if closestart and closeend then
    self._closestart = closestart
    self._closeend = closeend
  end

  -- inform higher level nodes about this element's existence in their branches
  local ancestor = self.parent
  while ancestor do
    ancestor.deepernodes:add(self)
    insert(ancestor.deeperelements, self.name, self)
    for attrname in pairs(self.attributes) do
      insert(ancestor.deeperattributes, attrname, self)
    end
    if self.id then
      insert(ancestor.deeperids, self.id, self)
    end
    for _, classname in ipairs(self.classes) do
      insert(ancestor.deeperclasses, classname, self)
    end
    ancestor = ancestor.parent
  end
end

--- Escape every Lua pattern magic character in `s` so the result matches
-- `s` literally when used as a pattern.
-- Escaped characters: ^ $ ( ) % . [ ] * + - ?
-- @param s string to escape
-- @return exactly one value: the escaped string
local function escape(s)
  -- string.gsub returns (result, substitution count). Keep only the result:
  -- a caller written as string.match(a, escape(v)) would otherwise forward
  -- the count as string.match's `init` argument and start searching too late.
  local escaped = string.gsub(s, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
  return escaped
end

--- Select descendants of `self` that match a CSS-like selector string.
--
-- The selector is split on whitespace into parts, applied left to right:
--   * `>`  restricts the next part to direct children of the current subjects
--   * `*`  matches every candidate node
--   * any other part is a sequence of simple selectors: `name`, `#id`,
--     `.class`, `[attr]`, `[attr<op>=value]`, optionally wrapped as
--     `:not(...)` to exclude instead of include
-- Attribute operators: `=` equals, `!=` not equals (also matches nodes lacking
-- the attribute), `|=` text before the first "-", `*=` contains, `~=`
-- whitespace-separated word, `^=` starts with, `$=` ends with.
-- Values are expected to be quoted: when the value is 2 or more characters
-- long its first and last characters are stripped unconditionally.
--
-- @param self node whose descendants are searched
-- @param s selector string
-- @return a Set of matching nodes; an empty Set when `s` is not a non-empty
--   string or contains no selectable part
local function select(self, s)
  if not s or type(s) ~= "string" or s == "" then return Set:new() end

  -- lookup tables keyed by the simple selector's prefix character
  local sets = {
    [""] = self.deeperelements,
    ["["] = self.deeperattributes,
    ["#"] = self.deeperids,
    ["."] = self.deeperclasses,
  }

  -- Resolve one simple selector (prefix t, body w) to a fresh Set of nodes.
  local function match(t, w)
    local m, e, v
    if t == "[" then
      w, m, e, v = string.match(w,
        "([^=|%*~%$!%^]+)" .. -- w = 1 or more characters up to a possible "=", "|", "*", "~", "$", "!", or "^"
        "([|%*~%$!%^]?)" .. -- m = an optional "|", "*", "~", "$", "!", or "^", preceding the optional "="
        "(=?)" .. -- e = the optional "="
        "(.*)" -- v = anything following the "=", or else ""
      )
    end
    local matched = Set:new(sets[t][w])
    -- attribute value selectors
    if e == "=" then
      if #v < 2 then v = "'" .. v .. "'" end -- values should be quoted
      v = string.sub(v, 2, #v - 1) -- strip quotes
      if m == "!" then matched = Set:new(self.deepernodes) end -- include those without that attribute
      -- Assigning to a single local keeps only the escaped string, never a
      -- second return value that would be taken as string.match's `init`.
      local pattern = escape(v)
      for node in pairs(matched) do
        local a = node.attributes[w]
        -- Decide first, remove afterwards: entries are only ever cleared
        -- while the set is being traversed, never (re)assigned.
        local keep = true
        if m == "" then -- equals
          keep = a == v
        elseif m == "!" then -- not equals
          keep = a ~= v
        elseif m == "|" then -- prefix
          keep = string.match(a, "^[^-]*") == v
        elseif m == "*" then -- contains
          keep = string.match(a, pattern) == v
        elseif m == "~" then -- word
          keep = false
          for word in string.gmatch(a, "%S+") do
            if word == v then
              keep = true
              break
            end
          end
        elseif m == "^" then -- starts with
          keep = string.match(a, "^" .. pattern) == v
        elseif m == "$" then -- ends with
          keep = string.match(a, pattern .. "$") == v
        end
        if not keep then matched:remove(node) end
      end
    end
    return matched
  end

  local subjects, resultset, childrenonly = Set:new({self})
  for part in string.gmatch(s, "%S+") do
    if part == ">" then
      -- applies to the next part only
      childrenonly = true
    else
      -- candidates: children or all descendants of every current subject
      resultset = Set:new()
      for subject in pairs(subjects) do
        local star = subject.deepernodes
        if childrenonly then star = Set:new(subject.nodes) end
        resultset = resultset + star
      end
      childrenonly = false

      if part ~= "*" then
        local excludes, filter = Set:new()
        for t, w in string.gmatch(part,
          "([:%[#.]?)" .. -- t = an optional :, [, #, or .
          "([^:%(%[#.%]%)]+)" .. -- w = 1 or more of anything not :, (, [, #, ., ], or )
          "%]?%)?" -- followed by an uncaptured optional ] and/or )
        ) do
          if t == ":" then
            -- remember the pseudo-class; it applies to the next simple selector
            filter = w
          else
            local matched = match(t, w)
            if filter == "not" then
              excludes = excludes + matched
            else
              resultset = resultset * matched
            end
            filter = nil
          end
        end
        resultset = resultset - excludes
      end

      -- Every selecting part, "*" included, becomes the context for the next
      -- one; otherwise "a > * > b" would look at children of a, not grandchildren.
      subjects = Set:new(resultset)
    end
  end

  -- a selector made only of ">" never builds a result set
  return resultset or Set:new()
end

--- Method form of the selector search: node:select(s).
-- @param s selector string
-- @return whatever the module-local `select` returns for this node and `s`
function ElementNode:select(s)
  return select(self, s)
end

-- Calling a node like a function, node(s), runs the same selector search.
ElementNode.mt.__call = select

return ElementNode