1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
12 // A parser implements the HTML5 parsing algorithm:
13 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
15 // tokenizer provides the tokens for the parser.
17 // tok is the most recently read token.
19 // Self-closing tags like <hr/> are re-interpreted as a two-token sequence:
20 // <hr> followed by </hr>. hasSelfClosingToken is true if we have just read
21 // the synthetic start tag and the next one due is the matching end tag.
22 hasSelfClosingToken bool
23 // doc is the document root element.
25 // The stack of open elements (section 11.2.3.2) and active formatting
26 // elements (section 11.2.3.3).
28 // Element pointers (section 11.2.3.4).
30 // Other parsing state flags (section 11.2.3.5).
31 scripting, framesetOK bool
32 // originalIM is the insertion mode to go back to after completing a text
33 // or inTableText insertion mode.
34 originalIM insertionMode
35 // fosterParenting is whether new elements should be inserted according to
36 // the foster parenting rules (section 11.2.5.3).
40 func (p *parser) top() *Node {
41 if n := p.oe.top(); n != nil {
47 // stopTags for use in popUntil. These come from section 11.2.3.2.
49 defaultScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}
50 listItemScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object", "ol", "ul"}
51 buttonScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object", "button"}
52 tableScopeStopTags = []string{"html", "table"}
55 // stopTags for use in clearStackToContext.
57 tableRowContextStopTags = []string{"tr", "html"}
60 // popUntil pops the stack of open elements at the highest element whose tag
61 // is in matchTags, provided there is no higher element in stopTags. It returns
62 // whether or not there was such an element. If there was not, popUntil leaves
63 // the stack unchanged.
65 // For example, if the stack was:
66 // ["html", "body", "font", "table", "b", "i", "u"]
67 // then popUntil([]string{"html, "table"}, "font") would return false, but
68 // popUntil([]string{"html, "table"}, "i") would return true and the resultant
70 // ["html", "body", "font", "table", "b"]
72 // If an element's tag is in both stopTags and matchTags, then the stack will
73 // be popped and the function returns true (provided, of course, there was no
74 // higher element in the stack that was also in stopTags). For example,
75 // popUntil([]string{"html, "table"}, "table") would return true and leave:
76 // ["html", "body", "font"]
77 func (p *parser) popUntil(stopTags []string, matchTags ...string) bool {
78 if i := p.indexOfElementInScope(stopTags, matchTags...); i != -1 {
85 // indexOfElementInScope returns the index in p.oe of the highest element
86 // whose tag is in matchTags that is in scope according to stopTags.
87 // If no matching element is in scope, it returns -1.
88 func (p *parser) indexOfElementInScope(stopTags []string, matchTags ...string) int {
89 for i := len(p.oe) - 1; i >= 0; i-- {
91 for _, t := range matchTags {
96 for _, t := range stopTags {
105 // elementInScope is like popUntil, except that it doesn't modify the stack of
107 func (p *parser) elementInScope(stopTags []string, matchTags ...string) bool {
108 return p.indexOfElementInScope(stopTags, matchTags...) != -1
111 // addChild adds a child node n to the top element, and pushes n onto the stack
112 // of open elements if it is an element node.
113 func (p *parser) addChild(n *Node) {
114 if p.fosterParenting {
120 if n.Type == ElementNode {
121 p.oe = append(p.oe, n)
125 // fosterParent adds a child node according to the foster parenting rules.
126 // Section 11.2.5.3, "foster parenting".
127 func (p *parser) fosterParent(n *Node) {
128 p.fosterParenting = false
129 var table, parent *Node
131 for i = len(p.oe) - 1; i >= 0; i-- {
132 if p.oe[i].Data == "table" {
139 // The foster parent is the html element.
142 parent = table.Parent
149 for i, child = range parent.Child {
155 if i > 0 && parent.Child[i-1].Type == TextNode && n.Type == TextNode {
156 parent.Child[i-1].Data += n.Data
160 if i == len(parent.Child) {
163 // Insert n into parent.Child at index i.
164 parent.Child = append(parent.Child[:i+1], parent.Child[i:]...)
170 // addText adds text to the preceding node if it is a text node, or else it
171 // calls addChild with a new text node.
172 func (p *parser) addText(text string) {
173 // TODO: distinguish whitespace text from others.
175 if i := len(t.Child); i > 0 && t.Child[i-1].Type == TextNode {
176 t.Child[i-1].Data += text
185 // addElement calls addChild with an element node.
186 func (p *parser) addElement(tag string, attr []Attribute) {
195 func (p *parser) addFormattingElement(tag string, attr []Attribute) {
196 p.addElement(tag, attr)
197 p.afe = append(p.afe, p.top())
202 func (p *parser) clearActiveFormattingElements() {
205 if len(p.afe) == 0 || n.Type == scopeMarkerNode {
212 func (p *parser) reconstructActiveFormattingElements() {
217 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
221 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
231 clone := p.afe[i].clone()
234 if i == len(p.afe)-1 {
240 // read reads the next token. This is usually from the tokenizer, but it may
241 // be the synthesized end tag implied by a self-closing tag.
242 func (p *parser) read() error {
243 if p.hasSelfClosingToken {
244 p.hasSelfClosingToken = false
245 p.tok.Type = EndTagToken
250 p.tok = p.tokenizer.Token()
253 return p.tokenizer.Err()
254 case SelfClosingTagToken:
255 p.hasSelfClosingToken = true
256 p.tok.Type = StartTagToken
262 func (p *parser) acknowledgeSelfClosingTag() {
263 p.hasSelfClosingToken = false
266 // An insertion mode (section 11.2.3.1) is the state transition function from
267 // a particular state in the HTML5 parser's state machine. It updates the
268 // parser's fields depending on parser.token (where ErrorToken means EOF). In
269 // addition to returning the next insertionMode state, it also returns whether
270 // the token was consumed.
271 type insertionMode func(*parser) (insertionMode, bool)
273 // useTheRulesFor runs the delegate insertionMode over p, returning the actual
274 // insertionMode unless the delegate caused a state transition.
275 // Section 11.2.3.1, "using the rules for".
276 func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) {
277 im, consumed := delegate(p)
278 if p.originalIM == delegate {
279 p.originalIM = actual
284 return actual, consumed
287 // setOriginalIM sets the insertion mode to return to after completing a text or
288 // inTableText insertion mode.
289 // Section 11.2.3.1, "using the rules for".
290 func (p *parser) setOriginalIM(im insertionMode) {
291 if p.originalIM != nil {
292 panic("html: bad parser state: originalIM was set twice")
297 // Section 11.2.3.1, "reset the insertion mode".
298 func (p *parser) resetInsertionMode() insertionMode {
299 for i := len(p.oe) - 1; i >= 0; i-- {
302 // TODO: set n to the context element, for HTML fragment parsing.
311 case "tbody", "thead", "tfoot":
314 // TODO: return inCaptionIM
316 // TODO: return inColumnGroupIM
324 // TODO: return inFramesetIM
332 // Section 11.2.5.4.1.
333 func initialIM(p *parser) (insertionMode, bool) {
340 return initialIM, true
346 return beforeHTMLIM, true
348 // TODO: set "quirks mode"? It's defined in the DOM spec instead of HTML5 proper,
349 // and so switching on "quirks mode" might belong in a different package.
350 return beforeHTMLIM, false
353 // Section 11.2.5.4.2.
354 func beforeHTMLIM(p *parser) (insertionMode, bool) {
364 // TODO: distinguish whitespace text from others.
367 if p.tok.Data == "html" {
375 case "head", "body", "html", "br":
385 return beforeHTMLIM, true
388 p.addElement("html", attr)
390 return beforeHeadIM, !implied
393 // Section 11.2.5.4.3.
394 func beforeHeadIM(p *parser) (insertionMode, bool) {
404 // TODO: distinguish whitespace text from others.
412 return useTheRulesFor(p, beforeHeadIM, inBodyIM)
418 case "head", "body", "html", "br":
428 return beforeHeadIM, true
431 p.addElement("head", attr)
434 return inHeadIM, !implied
437 const whitespace = " \t\r\n\f"
439 // Section 11.2.5.4.4.
440 func inHeadIM(p *parser) (insertionMode, bool) {
449 s := strings.TrimLeft(p.tok.Data, whitespace)
450 if len(s) < len(p.tok.Data) {
451 // Add the initial whitespace to the current node.
452 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
454 return inHeadIM, true
461 case "base", "basefont", "bgsound", "command", "link", "meta":
462 p.addElement(p.tok.Data, p.tok.Attr)
464 p.acknowledgeSelfClosingTag()
465 case "script", "title", "noscript", "noframes", "style":
466 p.addElement(p.tok.Data, p.tok.Attr)
467 p.setOriginalIM(inHeadIM)
473 if p.tok.Data == "head" {
482 return inHeadIM, true
486 if n.Data != "head" {
487 panic("html: bad parser state: <head> element not found, in the in-head insertion mode")
489 return afterHeadIM, !implied
491 return inHeadIM, true
494 // Section 11.2.5.4.6.
495 func afterHeadIM(p *parser) (insertionMode, bool) {
503 case ErrorToken, TextToken:
516 case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title":
517 p.oe = append(p.oe, p.head)
519 return useTheRulesFor(p, afterHeadIM, inHeadIM)
533 return afterHeadIM, true
536 p.addElement("body", attr)
537 p.framesetOK = framesetOK
539 return inBodyIM, !implied
542 // copyAttributes copies attributes of src not found on dst to dst.
543 func copyAttributes(dst *Node, src Token) {
544 if len(src.Attr) == 0 {
547 attr := map[string]string{}
548 for _, a := range dst.Attr {
551 for _, a := range src.Attr {
552 if _, ok := attr[a.Key]; !ok {
553 dst.Attr = append(dst.Attr, a)
559 // Section 11.2.5.4.7.
560 func inBodyIM(p *parser) (insertionMode, bool) {
563 p.reconstructActiveFormattingElements()
564 p.addText(p.tok.Data)
568 case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul":
569 p.popUntil(buttonScopeStopTags, "p")
570 p.addElement(p.tok.Data, p.tok.Attr)
571 case "h1", "h2", "h3", "h4", "h5", "h6":
572 p.popUntil(buttonScopeStopTags, "p")
573 switch n := p.top(); n.Data {
574 case "h1", "h2", "h3", "h4", "h5", "h6":
577 p.addElement(p.tok.Data, p.tok.Attr)
579 for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
580 if n := p.afe[i]; n.Type == ElementNode && n.Data == "a" {
581 p.inBodyEndTagFormatting("a")
587 p.reconstructActiveFormattingElements()
588 p.addFormattingElement(p.tok.Data, p.tok.Attr)
589 case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u":
590 p.reconstructActiveFormattingElements()
591 p.addFormattingElement(p.tok.Data, p.tok.Attr)
592 case "applet", "marquee", "object":
593 p.reconstructActiveFormattingElements()
594 p.addElement(p.tok.Data, p.tok.Attr)
595 p.afe = append(p.afe, &scopeMarker)
597 case "area", "br", "embed", "img", "input", "keygen", "wbr":
598 p.reconstructActiveFormattingElements()
599 p.addElement(p.tok.Data, p.tok.Attr)
601 p.acknowledgeSelfClosingTag()
604 p.popUntil(buttonScopeStopTags, "p") // TODO: skip this step in quirks mode.
605 p.addElement(p.tok.Data, p.tok.Attr)
607 return inTableIM, true
609 p.popUntil(buttonScopeStopTags, "p")
610 p.addElement(p.tok.Data, p.tok.Attr)
612 p.acknowledgeSelfClosingTag()
615 p.reconstructActiveFormattingElements()
616 p.addElement(p.tok.Data, p.tok.Attr)
618 // TODO: detect <select> inside a table.
619 return inSelectIM, true
622 for i := len(p.oe) - 1; i >= 0; i-- {
626 p.popUntil(listItemScopeStopTags, "li")
627 case "address", "div", "p":
630 if !isSpecialElement[node.Data] {
636 p.popUntil(buttonScopeStopTags, "p")
637 p.addElement("li", p.tok.Attr)
638 case "optgroup", "option":
639 if p.top().Data == "option" {
642 p.reconstructActiveFormattingElements()
643 p.addElement(p.tok.Data, p.tok.Attr)
647 if body.Type == ElementNode && body.Data == "body" {
649 copyAttributes(body, p.tok)
652 case "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title":
653 return useTheRulesFor(p, inBodyIM, inHeadIM)
656 return inBodyIM, false
659 p.addElement(p.tok.Data, p.tok.Attr)
664 // TODO: autoclose the stack of open elements.
665 return afterBodyIM, true
667 if !p.elementInScope(buttonScopeStopTags, "p") {
668 p.addElement("p", nil)
670 p.popUntil(buttonScopeStopTags, "p")
671 case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
672 p.inBodyEndTagFormatting(p.tok.Data)
673 case "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul":
674 p.popUntil(defaultScopeStopTags, p.tok.Data)
675 case "applet", "marquee", "object":
676 if p.popUntil(defaultScopeStopTags, p.tok.Data) {
677 p.clearActiveFormattingElements()
680 p.inBodyEndTagOther(p.tok.Data)
689 return inBodyIM, true
692 func (p *parser) inBodyEndTagFormatting(tag string) {
693 // This is the "adoption agency" algorithm, described at
694 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
696 // TODO: this is a fairly literal line-by-line translation of that algorithm.
697 // Once the code successfully parses the comprehensive test suite, we should
698 // refactor this code to be more idiomatic.
700 // Steps 1-3. The outer loop.
701 for i := 0; i < 8; i++ {
702 // Step 4. Find the formatting element.
703 var formattingElement *Node
704 for j := len(p.afe) - 1; j >= 0; j-- {
705 if p.afe[j].Type == scopeMarkerNode {
708 if p.afe[j].Data == tag {
709 formattingElement = p.afe[j]
713 if formattingElement == nil {
714 p.inBodyEndTagOther(tag)
717 feIndex := p.oe.index(formattingElement)
719 p.afe.remove(formattingElement)
722 if !p.elementInScope(defaultScopeStopTags, tag) {
727 // Steps 5-6. Find the furthest block.
728 var furthestBlock *Node
729 for _, e := range p.oe[feIndex:] {
730 if isSpecialElement[e.Data] {
735 if furthestBlock == nil {
737 for e != formattingElement {
744 // Steps 7-8. Find the common ancestor and bookmark node.
745 commonAncestor := p.oe[feIndex-1]
746 bookmark := p.afe.index(formattingElement)
748 // Step 9. The inner loop. Find the lastNode to reparent.
749 lastNode := furthestBlock
750 node := furthestBlock
751 x := p.oe.index(node)
753 for j := 0; j < 3; j++ {
758 if p.afe.index(node) == -1 {
763 if node == formattingElement {
767 clone := node.clone()
768 p.afe[p.afe.index(node)] = clone
769 p.oe[p.oe.index(node)] = clone
772 if lastNode == furthestBlock {
773 bookmark = p.afe.index(node) + 1
776 if lastNode.Parent != nil {
777 lastNode.Parent.Remove(lastNode)
784 // Step 10. Reparent lastNode to the common ancestor,
785 // or for misnested table nodes, to the foster parent.
786 if lastNode.Parent != nil {
787 lastNode.Parent.Remove(lastNode)
789 switch commonAncestor.Data {
790 case "table", "tbody", "tfoot", "thead", "tr":
791 p.fosterParent(lastNode)
793 commonAncestor.Add(lastNode)
796 // Steps 11-13. Reparent nodes from the furthest block's children
797 // to a clone of the formatting element.
798 clone := formattingElement.clone()
799 reparentChildren(clone, furthestBlock)
800 furthestBlock.Add(clone)
802 // Step 14. Fix up the list of active formatting elements.
803 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
804 // Move the bookmark with the rest of the list.
807 p.afe.remove(formattingElement)
808 p.afe.insert(bookmark, clone)
810 // Step 15. Fix up the stack of open elements.
811 p.oe.remove(formattingElement)
812 p.oe.insert(p.oe.index(furthestBlock)+1, clone)
816 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
817 func (p *parser) inBodyEndTagOther(tag string) {
818 for i := len(p.oe) - 1; i >= 0; i-- {
819 if p.oe[i].Data == tag {
823 if isSpecialElement[p.oe[i].Data] {
829 // Section 11.2.5.4.8.
830 func textIM(p *parser) (insertionMode, bool) {
835 p.addText(p.tok.Data)
842 return o, p.tok.Type == EndTagToken
845 // Section 11.2.5.4.9.
846 func inTableIM(p *parser) (insertionMode, bool) {
855 case "tbody", "tfoot", "thead":
856 p.clearStackToContext(tableScopeStopTags)
857 p.addElement(p.tok.Data, p.tok.Attr)
858 return inTableBodyIM, true
859 case "td", "th", "tr":
860 p.clearStackToContext(tableScopeStopTags)
861 p.addElement("tbody", nil)
862 return inTableBodyIM, false
864 if p.popUntil(tableScopeStopTags, "table") {
865 return p.resetInsertionMode(), false
868 return inTableIM, true
875 if p.popUntil(tableScopeStopTags, "table") {
876 return p.resetInsertionMode(), true
879 return inTableIM, true
880 case "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr":
882 return inTableIM, true
889 return inTableIM, true
892 switch p.top().Data {
893 case "table", "tbody", "tfoot", "thead", "tr":
894 p.fosterParenting = true
895 defer func() { p.fosterParenting = false }()
898 return useTheRulesFor(p, inTableIM, inBodyIM)
901 // clearStackToContext pops elements off the stack of open elements
902 // until an element listed in stopTags is found.
903 func (p *parser) clearStackToContext(stopTags []string) {
904 for i := len(p.oe) - 1; i >= 0; i-- {
905 for _, tag := range stopTags {
906 if p.oe[i].Data == tag {
914 // Section 11.2.5.4.13.
915 func inTableBodyIM(p *parser) (insertionMode, bool) {
944 if p.popUntil(tableScopeStopTags, "tbody", "thead", "tfoot") {
945 return inTableIM, false
948 return inTableBodyIM, true
949 case "body", "caption", "col", "colgroup", "html", "td", "th", "tr":
951 return inTableBodyIM, true
958 return inTableBodyIM, true
961 // TODO: clear the stack back to a table body context.
962 p.addElement(data, attr)
963 return inRowIM, consumed
965 return useTheRulesFor(p, inTableBodyIM, inTableIM)
968 // Section 11.2.5.4.14.
969 func inRowIM(p *parser) (insertionMode, bool) {
978 p.clearStackToContext(tableRowContextStopTags)
979 p.addElement(p.tok.Data, p.tok.Attr)
980 p.afe = append(p.afe, &scopeMarker)
981 return inCellIM, true
982 case "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr":
983 if p.popUntil(tableScopeStopTags, "tr") {
984 return inTableBodyIM, false
994 if p.popUntil(tableScopeStopTags, "tr") {
995 return inTableBodyIM, true
1000 if p.popUntil(tableScopeStopTags, "tr") {
1001 return inTableBodyIM, false
1003 // Ignore the token.
1004 return inRowIM, true
1005 case "tbody", "tfoot", "thead":
1007 case "body", "caption", "col", "colgroup", "html", "td", "th":
1008 // Ignore the token.
1009 return inRowIM, true
1018 return inRowIM, true
1020 return useTheRulesFor(p, inRowIM, inTableIM)
1023 // Section 11.2.5.4.15.
1024 func inCellIM(p *parser) (insertionMode, bool) {
1026 closeTheCellAndReprocess bool
1031 case "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr":
1032 // TODO: check for "td" or "th" in table scope.
1033 closeTheCellAndReprocess = true
1038 if !p.popUntil(tableScopeStopTags, p.tok.Data) {
1039 // Ignore the token.
1040 return inCellIM, true
1042 p.clearActiveFormattingElements()
1043 return inRowIM, true
1044 case "body", "caption", "col", "colgroup", "html":
1046 case "table", "tbody", "tfoot", "thead", "tr":
1047 // TODO: check for matching element in table scope.
1048 closeTheCellAndReprocess = true
1055 return inCellIM, true
1057 if closeTheCellAndReprocess {
1058 if p.popUntil(tableScopeStopTags, "td") || p.popUntil(tableScopeStopTags, "th") {
1059 p.clearActiveFormattingElements()
1060 return inRowIM, false
1063 return useTheRulesFor(p, inCellIM, inBodyIM)
1066 // Section 11.2.5.4.16.
1067 func inSelectIM(p *parser) (insertionMode, bool) {
1073 p.addText(p.tok.Data)
1079 if p.top().Data == "option" {
1082 p.addElement(p.tok.Data, p.tok.Attr)
1087 case "input", "keygen", "textarea":
1092 // Ignore the token.
1103 // Ignore the token.
1112 for i := len(p.oe) - 1; i >= 0; i-- {
1113 switch p.oe[i].Data {
1116 return p.resetInsertionMode(), true
1117 case "option", "optgroup":
1120 // Ignore the token.
1121 return inSelectIM, true
1125 return inSelectIM, true
1128 // Section 11.2.5.4.18.
1129 func afterBodyIM(p *parser) (insertionMode, bool) {
1140 // TODO: autoclose the stack of open elements.
1141 return afterAfterBodyIM, true
1146 // The comment is attached to the <html> element.
1147 if len(p.oe) < 1 || p.oe[0].Data != "html" {
1148 panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
1154 return afterBodyIM, true
1156 // TODO: should this be "return inBodyIM, true"?
1157 return afterBodyIM, true
1160 // Section 11.2.5.4.21.
1161 func afterAfterBodyIM(p *parser) (insertionMode, bool) {
1169 if p.tok.Data == "html" {
1170 return useTheRulesFor(p, afterAfterBodyIM, inBodyIM)
1177 return afterAfterBodyIM, true
1179 return inBodyIM, false
1182 // Parse returns the parse tree for the HTML from the given Reader.
1183 // The input is assumed to be UTF-8 encoded.
1184 func Parse(r io.Reader) (*Node, error) {
1186 tokenizer: NewTokenizer(r),
1193 // Iterate until EOF. Any other error will cause an early return.
1194 im, consumed := initialIM, true
1197 if err := p.read(); err != nil {
1204 im, consumed = im(p)
1206 // Loop until the final token (the ErrorToken signifying EOF) is consumed.
1208 if im, consumed = im(p); consumed {