# # $Id: htmlparser.icn,v 1.2 2004/04/18 14:20:19 rparlett Exp $ # # This file is in the public domain. # # Author: Robert Parlett (parlett@dial.pipex.com) # package xml global idintro, idchars, keychars, valchars, delim class Tag : Node(name, closed, attributes, empty) # # Set the name of the tag, which is converted to upper case. Any # leading "/" should be omitted. # method set_name(s) return name := map(s, &lcase, &ucase) end # # Succeed if the given tag is a matching closing tag for this tag. # method balances(other) return \other.closed & (name == other.name) end # # Succeed if the tag is a closing tag. # method is_closed() return \closed end # # Indicate that the tag is a closing tag, ie has a leading "/" # method set_closed() return closed := 1 end # # Succeed if the tag is an empty tag, ie of the form <tag/>. # method is_empty() return \empty end # # Indicate that the tag is an empty tag. # method set_empty() return empty := 1 end # # Return the attributes map for this tag. # method get_attributes() return attributes end # # Set an attribute for this tag. # method set_attribute(key, value) return attributes[map(key, &lcase, &ucase)] := value end method get_type() return "tag" end initially(doc) self.Node.initially(doc) self.attributes := table() end class HtmlParser : Parser(doc) # # Parse the string s returning an HtmlDocument object. # method parse(s) doc := HtmlDocument() s ? { repeat { move_to_node() if pos(0) then { # # No opening node found, so put whole in html tags and retry # return parse("<html>" || s || "</html>") } i := &pos node := parse_node() | break case node.get_type() of { "doctype" : doc.add_child(node) "comment" : { doc.add_child(node) } "tag" : { if not(node.is_closed()) & (node.name == "HTML") then { # # Found the html start point, so parse and return that node. # doc.add_child(parse_element(node)) return doc } else { # # Bad structure, so put html tags around it and start again. # return parse(s[1:i] || "<html>" || s[i:0] || "</html>") } } } } } end # # Parse the current input into an {Node} # # @p method parse_node(parent) ="<" | fail if ="!" then { if any(idintro) then return parse_doctype() else return parse_comment(parent) } else return parse_tag() end # # Parse and return an Element, given the starting tag # # @p method parse_element(start_tag, parent) res := HtmlElement() res.name := start_tag.name res.attributes := start_tag.attributes if \debug then print_debug("Parsing node ", start_tag.name) if start_tag.name == "PLAINTEXT" then put(res.children, "" ~== remove_entities(tab(0))) else if not(start_tag.is_empty() | html_is_standalone_tag(start_tag.name)) then { repeat { put(res.children, "" ~== remove_entities(move_to_node())) if pos(0) then break i := &pos node := parse_node(res) | break if node.get_type() == "tag" then { if start_tag.balances(node) then break if html_is_autoclose_tag(start_tag.name) & start_tag.name == node.name then { # # For example <p> ... <p> is treated as <p> ... </p><p>. The second # <p> has to be rescanned by the caller, so tab back. # tab(i) break } if node.is_closed() then { # # Mismatched close. If processing something like a <p>, then assume # a </p> and break; otherwise just ignore. For example : # <p> ... </td> -> <p> ... </p></td> # Again, the </td> needs to be rescanned, so tab back. # if html_is_autoclose_tag(start_tag.name) then { tab(i) break } if \debug then print_debug("Skipping closing tag", node.to_string()) } else { put(res.children, n := parse_element(node, res)) if \debug then print_debug("Adding node ", n.to_string() || " to" || start_tag.to_string()) } } else { # # It's either a coment, or something else; just add it to the list of children # put(res.children, node) } } } if \debug then print_debug("Finished parsing node ", start_tag.to_string()) return res end # # DOCTYPE # # @p method parse_doctype() res := DocType(doc) s := tab(many(idchars)) | fail #res.set_name("!" || s) repeat { tab(upto(valchars ++ delim ++ ">")) if =">" then break if any(delim) then { c := move(1) s := tab(upto(c) | 0) move(1) } else s := tab(many(valchars)) | break #put(res.comments, remove_entities(s)) } return res end # # Comment. # # @p method parse_comment(parent) res := Comment(doc, parent) repeat { tab(upto('->')) if =">" then break ="--" | fail s := tab(find("--") | 0) ="--" res.comment ||:= s } return res end # # Normal tag # # @p method parse_tag() res := Tag(doc) if ="/" then res.set_closed() res.set_name(any(idintro) & tab(many(idchars))) | fail repeat { tab(upto(keychars ++ '/>')) if ="/>" then { res.set_empty() break } if =">" then break key := tab(many(keychars)) | break tab(upto(valchars ++ '>=')) if ="=" then { tab(upto(valchars ++ delim)) if any(delim) then { c := move(1) s := tab(upto(c) | 0) move(1) } else s := tab(many(valchars)) | break value := remove_entities(s) } else value := &null res.set_attribute(key, value) } return res end # # Move to the next node (tag, comment, doctype), or end of file. # # @p method move_to_node() s := "" repeat { s ||:= tab(upto('<') | 0) if pos(0) then break i := &pos if parse_node() then { # # Found a valid node, so tab back and break # tab(i) break } # # Not a valid node, so tab back and continue # tab(i) s ||:= move(1) } if \debug then print_debug("Moved to node") return s end # # Remove the entity characters from a string. # # @p method remove_entities(s) static entities initial { entities := table() every e := ![["amp", "&"], ["lt", "<"], ["gt", ">"], ["quot", "\""], ["nbsp", char(160)], ["iexcl", char(161)], ["cent", char(162)], ["pound", char(163)], ["curren", char(164)], ["yen", char(165)], ["brvbar", char(166)], ["sect", char(167)], ["uml", char(168)], ["copy", char(169)], ["ordf", char(170)], ["laquo", char(171)], ["not", char(172)], ["shy", char(173)], ["reg", char(174)], ["macr", char(175)], ["deg", char(176)], ["plusmn", char(177)], ["sup2", char(178)], ["sup3", char(179)], ["acute", char(180)], ["micro", char(181)], ["para", char(182)], ["middot", char(183)], ["cedil", char(184)], ["sup1", char(185)], ["ordm", char(186)], ["raquo", char(187)], ["frac14", char(188)], ["frac12", char(189)], ["frac34", char(190)], ["iquest", char(191)], ["Agrave", char(192)], ["Aacute", char(193)], ["Acirc", char(194)], ["Atilde", char(195)], ["Auml", char(196)], ["Aring", char(197)], ["AElig", char(198)], ["Ccedil", char(199)], ["Egrave", char(200)], ["Eacute", char(201)], ["Ecirc", char(202)], ["Euml", char(203)], ["Igrave", char(204)], ["Iacute", char(205)], ["Icirc", char(206)], ["Iuml", char(207)], ["ETH", char(208)], ["Ntilde", char(209)], ["Ograve", char(210)], ["Oacute", char(211)], ["Ocirc", char(212)], ["Otilde", char(213)], ["Ouml", char(214)], ["times", char(215)], ["Oslash", char(216)], ["Ugrave", char(217)], ["Uacute", char(218)], ["Ucirc", char(219)], ["Uuml", char(220)], ["Yacute", char(221)], ["THORN", char(222)], ["szlig", char(223)], ["agrave", char(224)], ["aacute", char(225)], ["acirc", char(226)], ["atilde", char(227)], ["auml", char(228)], ["aring", char(229)], ["aelig", char(230)], ["ccedil", char(231)], ["egrave", char(232)], ["eacute", char(233)], ["ecirc", char(234)], ["euml", char(235)], ["igrave", char(236)], ["iacute", char(237)], ["icirc", char(238)], ["iuml", char(239)], ["eth", char(240)], ["ntilde", char(241)], ["ograve", char(242)], ["oacute", char(243)], ["ocirc", char(244)], ["otilde", char(245)], ["ouml", char(246)], ["divide", char(247)], ["oslash", char(248)], ["ugrave", char(249)], ["uacute", char(250)], ["ucirc", char(251)], ["uuml", char(252)], ["yacute", char(253)], ["thorn", char(254)], ["yuml", char(255)]] do entities[e[1]] := e[2] } res := "" s ? { repeat { res ||:= tab(upto('&') | 0) if pos(0) then break ="&" if ="#" then { if i := (256 > integer(tab(many(&digits)))) then { res ||:= char(i) =";" } else res ||:= "&#" } else { if s := \entities[tab(many(&ucase ++ &lcase))] then { res ||:= s =";" } else res ||:= "&" } } } return res end # # Return the document being parsed. # method get_document() return doc end initially() initial { init_html_globals() } end procedure init_html_globals() if \idintro then return idintro := &lcase ++ &ucase idchars := idintro ++ &digits ++ '.-' keychars := &lcase ++ &ucase ++ "_" ++ &digits valchars := &ascii -- ' \t\n\r>=' delim := '\"\'' end # # Succeed if and only if the tag name is a standalone tag, ie it has no # matching closing tag. # procedure html_is_standalone_tag(name) static s initial { s := set() every insert(s, "IMG" | "BR" | "HR" | "META" | "BASE" | "INPUT" | "AREA" | "OPTION" | "BASEFONT" | "BGSOUND" | "COL" | "COLGROUP" | "ISINDEX" | "LINK" | "NEXTID" | "SPACER" | "TBODY" | "TFOOT" | "THEAD" | "WBR") } return member(s, name) end # # Succeed if and only if the tag implicitly closes when a matching opening # tag is encountered at the same level. # procedure html_is_autoclose_tag(name) static s initial { s := set() every insert(s, "LI" | "P" | "DD" | "DT" | "FONT") } return member(s, name) end