OSDN Git Service

libgo: Update to weekly.2011-11-18.
[pf3gnuchains/gcc-fork.git] / libgo / go / html / parse.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package html
6
7 import (
8         "io"
9         "strings"
10 )
11
12 // A parser implements the HTML5 parsing algorithm:
13 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
14 type parser struct {
15         // tokenizer provides the tokens for the parser.
16         tokenizer *Tokenizer
17         // tok is the most recently read token.
18         tok Token
19         // Self-closing tags like <hr/> are re-interpreted as a two-token sequence:
20         // <hr> followed by </hr>. hasSelfClosingToken is true if we have just read
21         // the synthetic start tag and the next one due is the matching end tag.
22         hasSelfClosingToken bool
23         // doc is the document root element.
24         doc *Node
25         // The stack of open elements (section 11.2.3.2) and active formatting
26         // elements (section 11.2.3.3).
27         oe, afe nodeStack
28         // Element pointers (section 11.2.3.4).
29         head, form *Node
30         // Other parsing state flags (section 11.2.3.5).
31         scripting, framesetOK bool
32         // im is the current insertion mode.
33         im insertionMode
34         // originalIM is the insertion mode to go back to after completing a text
35         // or inTableText insertion mode.
36         originalIM insertionMode
37         // fosterParenting is whether new elements should be inserted according to
38         // the foster parenting rules (section 11.2.5.3).
39         fosterParenting bool
40 }
41
42 func (p *parser) top() *Node {
43         if n := p.oe.top(); n != nil {
44                 return n
45         }
46         return p.doc
47 }
48
// Tag lists that bound a search down the stack of open elements.
var (
	// Stop tags for use in popUntil. These come from section 11.2.3.2.
	defaultScopeStopTags  = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object"}
	listItemScopeStopTags = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object", "ol", "ul"}
	buttonScopeStopTags   = []string{"applet", "caption", "html", "table", "td", "th", "marquee", "object", "button"}
	tableScopeStopTags    = []string{"html", "table"}

	// Stop tags for use in clearStackToContext.
	tableRowContextStopTags = []string{"tr", "html"}
)
61
62 // popUntil pops the stack of open elements at the highest element whose tag
63 // is in matchTags, provided there is no higher element in stopTags. It returns
64 // whether or not there was such an element. If there was not, popUntil leaves
65 // the stack unchanged.
66 //
67 // For example, if the stack was:
68 // ["html", "body", "font", "table", "b", "i", "u"]
69 // then popUntil([]string{"html, "table"}, "font") would return false, but
70 // popUntil([]string{"html, "table"}, "i") would return true and the resultant
71 // stack would be:
72 // ["html", "body", "font", "table", "b"]
73 //
74 // If an element's tag is in both stopTags and matchTags, then the stack will
75 // be popped and the function returns true (provided, of course, there was no
76 // higher element in the stack that was also in stopTags). For example,
77 // popUntil([]string{"html, "table"}, "table") would return true and leave:
78 // ["html", "body", "font"]
79 func (p *parser) popUntil(stopTags []string, matchTags ...string) bool {
80         if i := p.indexOfElementInScope(stopTags, matchTags...); i != -1 {
81                 p.oe = p.oe[:i]
82                 return true
83         }
84         return false
85 }
86
87 // indexOfElementInScope returns the index in p.oe of the highest element
88 // whose tag is in matchTags that is in scope according to stopTags.
89 // If no matching element is in scope, it returns -1.
90 func (p *parser) indexOfElementInScope(stopTags []string, matchTags ...string) int {
91         for i := len(p.oe) - 1; i >= 0; i-- {
92                 tag := p.oe[i].Data
93                 for _, t := range matchTags {
94                         if t == tag {
95                                 return i
96                         }
97                 }
98                 for _, t := range stopTags {
99                         if t == tag {
100                                 return -1
101                         }
102                 }
103         }
104         return -1
105 }
106
107 // elementInScope is like popUntil, except that it doesn't modify the stack of
108 // open elements.
109 func (p *parser) elementInScope(stopTags []string, matchTags ...string) bool {
110         return p.indexOfElementInScope(stopTags, matchTags...) != -1
111 }
112
113 // addChild adds a child node n to the top element, and pushes n onto the stack
114 // of open elements if it is an element node.
115 func (p *parser) addChild(n *Node) {
116         if p.fosterParenting {
117                 p.fosterParent(n)
118         } else {
119                 p.top().Add(n)
120         }
121
122         if n.Type == ElementNode {
123                 p.oe = append(p.oe, n)
124         }
125 }
126
127 // fosterParent adds a child node according to the foster parenting rules.
128 // Section 11.2.5.3, "foster parenting".
129 func (p *parser) fosterParent(n *Node) {
130         p.fosterParenting = false
131         var table, parent *Node
132         var i int
133         for i = len(p.oe) - 1; i >= 0; i-- {
134                 if p.oe[i].Data == "table" {
135                         table = p.oe[i]
136                         break
137                 }
138         }
139
140         if table == nil {
141                 // The foster parent is the html element.
142                 parent = p.oe[0]
143         } else {
144                 parent = table.Parent
145         }
146         if parent == nil {
147                 parent = p.oe[i-1]
148         }
149
150         var child *Node
151         for i, child = range parent.Child {
152                 if child == table {
153                         break
154                 }
155         }
156
157         if i > 0 && parent.Child[i-1].Type == TextNode && n.Type == TextNode {
158                 parent.Child[i-1].Data += n.Data
159                 return
160         }
161
162         if i == len(parent.Child) {
163                 parent.Add(n)
164         } else {
165                 // Insert n into parent.Child at index i.
166                 parent.Child = append(parent.Child[:i+1], parent.Child[i:]...)
167                 parent.Child[i] = n
168                 n.Parent = parent
169         }
170 }
171
172 // addText adds text to the preceding node if it is a text node, or else it
173 // calls addChild with a new text node.
174 func (p *parser) addText(text string) {
175         // TODO: distinguish whitespace text from others.
176         t := p.top()
177         if i := len(t.Child); i > 0 && t.Child[i-1].Type == TextNode {
178                 t.Child[i-1].Data += text
179                 return
180         }
181         p.addChild(&Node{
182                 Type: TextNode,
183                 Data: text,
184         })
185 }
186
187 // addElement calls addChild with an element node.
188 func (p *parser) addElement(tag string, attr []Attribute) {
189         p.addChild(&Node{
190                 Type: ElementNode,
191                 Data: tag,
192                 Attr: attr,
193         })
194 }
195
196 // Section 11.2.3.3.
197 func (p *parser) addFormattingElement(tag string, attr []Attribute) {
198         p.addElement(tag, attr)
199         p.afe = append(p.afe, p.top())
200         // TODO.
201 }
202
203 // Section 11.2.3.3.
204 func (p *parser) clearActiveFormattingElements() {
205         for {
206                 n := p.afe.pop()
207                 if len(p.afe) == 0 || n.Type == scopeMarkerNode {
208                         return
209                 }
210         }
211 }
212
213 // Section 11.2.3.3.
214 func (p *parser) reconstructActiveFormattingElements() {
215         n := p.afe.top()
216         if n == nil {
217                 return
218         }
219         if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
220                 return
221         }
222         i := len(p.afe) - 1
223         for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
224                 if i == 0 {
225                         i = -1
226                         break
227                 }
228                 i--
229                 n = p.afe[i]
230         }
231         for {
232                 i++
233                 clone := p.afe[i].clone()
234                 p.addChild(clone)
235                 p.afe[i] = clone
236                 if i == len(p.afe)-1 {
237                         break
238                 }
239         }
240 }
241
242 // read reads the next token. This is usually from the tokenizer, but it may
243 // be the synthesized end tag implied by a self-closing tag.
244 func (p *parser) read() error {
245         if p.hasSelfClosingToken {
246                 p.hasSelfClosingToken = false
247                 p.tok.Type = EndTagToken
248                 p.tok.Attr = nil
249                 return nil
250         }
251         p.tokenizer.Next()
252         p.tok = p.tokenizer.Token()
253         switch p.tok.Type {
254         case ErrorToken:
255                 return p.tokenizer.Err()
256         case SelfClosingTagToken:
257                 p.hasSelfClosingToken = true
258                 p.tok.Type = StartTagToken
259         }
260         return nil
261 }
262
263 // Section 11.2.4.
264 func (p *parser) acknowledgeSelfClosingTag() {
265         p.hasSelfClosingToken = false
266 }
267
268 // An insertion mode (section 11.2.3.1) is the state transition function from
269 // a particular state in the HTML5 parser's state machine. It updates the
270 // parser's fields depending on parser.tok (where ErrorToken means EOF).
271 // It returns whether the token was consumed.
272 type insertionMode func(*parser) bool
273
274 // setOriginalIM sets the insertion mode to return to after completing a text or
275 // inTableText insertion mode.
276 // Section 11.2.3.1, "using the rules for".
277 func (p *parser) setOriginalIM() {
278         if p.originalIM != nil {
279                 panic("html: bad parser state: originalIM was set twice")
280         }
281         p.originalIM = p.im
282 }
283
284 // Section 11.2.3.1, "reset the insertion mode".
285 func (p *parser) resetInsertionMode() {
286         for i := len(p.oe) - 1; i >= 0; i-- {
287                 n := p.oe[i]
288                 if i == 0 {
289                         // TODO: set n to the context element, for HTML fragment parsing.
290                 }
291                 switch n.Data {
292                 case "select":
293                         p.im = inSelectIM
294                 case "td", "th":
295                         p.im = inCellIM
296                 case "tr":
297                         p.im = inRowIM
298                 case "tbody", "thead", "tfoot":
299                         p.im = inTableBodyIM
300                 case "caption":
301                         p.im = inCaptionIM
302                 case "colgroup":
303                         p.im = inColumnGroupIM
304                 case "table":
305                         p.im = inTableIM
306                 case "head":
307                         p.im = inBodyIM
308                 case "body":
309                         p.im = inBodyIM
310                 case "frameset":
311                         p.im = inFramesetIM
312                 case "html":
313                         p.im = beforeHeadIM
314                 default:
315                         continue
316                 }
317                 return
318         }
319         p.im = inBodyIM
320 }
321
322 // Section 11.2.5.4.1.
323 func initialIM(p *parser) bool {
324         switch p.tok.Type {
325         case CommentToken:
326                 p.doc.Add(&Node{
327                         Type: CommentNode,
328                         Data: p.tok.Data,
329                 })
330                 return true
331         case DoctypeToken:
332                 p.doc.Add(&Node{
333                         Type: DoctypeNode,
334                         Data: p.tok.Data,
335                 })
336                 p.im = beforeHTMLIM
337                 return true
338         }
339         // TODO: set "quirks mode"? It's defined in the DOM spec instead of HTML5 proper,
340         // and so switching on "quirks mode" might belong in a different package.
341         p.im = beforeHTMLIM
342         return false
343 }
344
345 // Section 11.2.5.4.2.
346 func beforeHTMLIM(p *parser) bool {
347         switch p.tok.Type {
348         case StartTagToken:
349                 if p.tok.Data == "html" {
350                         p.addElement(p.tok.Data, p.tok.Attr)
351                         p.im = beforeHeadIM
352                         return true
353                 }
354         case EndTagToken:
355                 switch p.tok.Data {
356                 case "head", "body", "html", "br":
357                         // Drop down to creating an implied <html> tag.
358                 default:
359                         // Ignore the token.
360                         return true
361                 }
362         case CommentToken:
363                 p.doc.Add(&Node{
364                         Type: CommentNode,
365                         Data: p.tok.Data,
366                 })
367                 return true
368         }
369         // Create an implied <html> tag.
370         p.addElement("html", nil)
371         p.im = beforeHeadIM
372         return false
373 }
374
375 // Section 11.2.5.4.3.
376 func beforeHeadIM(p *parser) bool {
377         var (
378                 add     bool
379                 attr    []Attribute
380                 implied bool
381         )
382         switch p.tok.Type {
383         case ErrorToken:
384                 implied = true
385         case TextToken:
386                 // TODO: distinguish whitespace text from others.
387                 implied = true
388         case StartTagToken:
389                 switch p.tok.Data {
390                 case "head":
391                         add = true
392                         attr = p.tok.Attr
393                 case "html":
394                         return inBodyIM(p)
395                 default:
396                         implied = true
397                 }
398         case EndTagToken:
399                 switch p.tok.Data {
400                 case "head", "body", "html", "br":
401                         implied = true
402                 default:
403                         // Ignore the token.
404                 }
405         case CommentToken:
406                 p.addChild(&Node{
407                         Type: CommentNode,
408                         Data: p.tok.Data,
409                 })
410                 return true
411         }
412         if add || implied {
413                 p.addElement("head", attr)
414                 p.head = p.top()
415         }
416         p.im = inHeadIM
417         return !implied
418 }
419
420 const whitespace = " \t\r\n\f"
421
422 // Section 11.2.5.4.4.
423 func inHeadIM(p *parser) bool {
424         var (
425                 pop     bool
426                 implied bool
427         )
428         switch p.tok.Type {
429         case ErrorToken:
430                 implied = true
431         case TextToken:
432                 s := strings.TrimLeft(p.tok.Data, whitespace)
433                 if len(s) < len(p.tok.Data) {
434                         // Add the initial whitespace to the current node.
435                         p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
436                         if s == "" {
437                                 return true
438                         }
439                         p.tok.Data = s
440                 }
441                 implied = true
442         case StartTagToken:
443                 switch p.tok.Data {
444                 case "base", "basefont", "bgsound", "command", "link", "meta":
445                         p.addElement(p.tok.Data, p.tok.Attr)
446                         p.oe.pop()
447                         p.acknowledgeSelfClosingTag()
448                 case "script", "title", "noscript", "noframes", "style":
449                         p.addElement(p.tok.Data, p.tok.Attr)
450                         p.setOriginalIM()
451                         p.im = textIM
452                         return true
453                 default:
454                         implied = true
455                 }
456         case EndTagToken:
457                 switch p.tok.Data {
458                 case "head":
459                         pop = true
460                 case "body", "html", "br":
461                         implied = true
462                 default:
463                         // Ignore the token.
464                         return true
465                 }
466         case CommentToken:
467                 p.addChild(&Node{
468                         Type: CommentNode,
469                         Data: p.tok.Data,
470                 })
471                 return true
472         }
473         if pop || implied {
474                 n := p.oe.pop()
475                 if n.Data != "head" {
476                         panic("html: bad parser state: <head> element not found, in the in-head insertion mode")
477                 }
478                 p.im = afterHeadIM
479                 return !implied
480         }
481         return true
482 }
483
484 // Section 11.2.5.4.6.
485 func afterHeadIM(p *parser) bool {
486         var (
487                 add        bool
488                 attr       []Attribute
489                 framesetOK bool
490                 implied    bool
491         )
492         switch p.tok.Type {
493         case ErrorToken, TextToken:
494                 implied = true
495                 framesetOK = true
496         case StartTagToken:
497                 switch p.tok.Data {
498                 case "html":
499                         // TODO.
500                 case "body":
501                         add = true
502                         attr = p.tok.Attr
503                         framesetOK = false
504                 case "frameset":
505                         p.addElement(p.tok.Data, p.tok.Attr)
506                         p.im = inFramesetIM
507                         return true
508                 case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title":
509                         p.oe = append(p.oe, p.head)
510                         defer p.oe.pop()
511                         return inHeadIM(p)
512                 case "head":
513                         // TODO.
514                 default:
515                         implied = true
516                         framesetOK = true
517                 }
518         case EndTagToken:
519                 switch p.tok.Data {
520                 case "body", "html", "br":
521                         implied = true
522                         framesetOK = true
523                 default:
524                         // Ignore the token.
525                         return true
526                 }
527         case CommentToken:
528                 p.addChild(&Node{
529                         Type: CommentNode,
530                         Data: p.tok.Data,
531                 })
532                 return true
533         }
534         if add || implied {
535                 p.addElement("body", attr)
536                 p.framesetOK = framesetOK
537         }
538         p.im = inBodyIM
539         return !implied
540 }
541
542 // copyAttributes copies attributes of src not found on dst to dst.
543 func copyAttributes(dst *Node, src Token) {
544         if len(src.Attr) == 0 {
545                 return
546         }
547         attr := map[string]string{}
548         for _, a := range dst.Attr {
549                 attr[a.Key] = a.Val
550         }
551         for _, a := range src.Attr {
552                 if _, ok := attr[a.Key]; !ok {
553                         dst.Attr = append(dst.Attr, a)
554                         attr[a.Key] = a.Val
555                 }
556         }
557 }
558
559 // Section 11.2.5.4.7.
560 func inBodyIM(p *parser) bool {
561         switch p.tok.Type {
562         case TextToken:
563                 p.reconstructActiveFormattingElements()
564                 p.addText(p.tok.Data)
565                 p.framesetOK = false
566         case StartTagToken:
567                 switch p.tok.Data {
568                 case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul":
569                         p.popUntil(buttonScopeStopTags, "p")
570                         p.addElement(p.tok.Data, p.tok.Attr)
571                 case "h1", "h2", "h3", "h4", "h5", "h6":
572                         p.popUntil(buttonScopeStopTags, "p")
573                         switch n := p.top(); n.Data {
574                         case "h1", "h2", "h3", "h4", "h5", "h6":
575                                 p.oe.pop()
576                         }
577                         p.addElement(p.tok.Data, p.tok.Attr)
578                 case "a":
579                         for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
580                                 if n := p.afe[i]; n.Type == ElementNode && n.Data == "a" {
581                                         p.inBodyEndTagFormatting("a")
582                                         p.oe.remove(n)
583                                         p.afe.remove(n)
584                                         break
585                                 }
586                         }
587                         p.reconstructActiveFormattingElements()
588                         p.addFormattingElement(p.tok.Data, p.tok.Attr)
589                 case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u":
590                         p.reconstructActiveFormattingElements()
591                         p.addFormattingElement(p.tok.Data, p.tok.Attr)
592                 case "applet", "marquee", "object":
593                         p.reconstructActiveFormattingElements()
594                         p.addElement(p.tok.Data, p.tok.Attr)
595                         p.afe = append(p.afe, &scopeMarker)
596                         p.framesetOK = false
597                 case "area", "br", "embed", "img", "input", "keygen", "wbr":
598                         p.reconstructActiveFormattingElements()
599                         p.addElement(p.tok.Data, p.tok.Attr)
600                         p.oe.pop()
601                         p.acknowledgeSelfClosingTag()
602                         p.framesetOK = false
603                 case "table":
604                         p.popUntil(buttonScopeStopTags, "p") // TODO: skip this step in quirks mode.
605                         p.addElement(p.tok.Data, p.tok.Attr)
606                         p.framesetOK = false
607                         p.im = inTableIM
608                         return true
609                 case "hr":
610                         p.popUntil(buttonScopeStopTags, "p")
611                         p.addElement(p.tok.Data, p.tok.Attr)
612                         p.oe.pop()
613                         p.acknowledgeSelfClosingTag()
614                         p.framesetOK = false
615                 case "select":
616                         p.reconstructActiveFormattingElements()
617                         p.addElement(p.tok.Data, p.tok.Attr)
618                         p.framesetOK = false
619                         // TODO: detect <select> inside a table.
620                         p.im = inSelectIM
621                         return true
622                 case "form":
623                         if p.form == nil {
624                                 p.popUntil(buttonScopeStopTags, "p")
625                                 p.addElement(p.tok.Data, p.tok.Attr)
626                                 p.form = p.top()
627                         }
628                 case "li":
629                         p.framesetOK = false
630                         for i := len(p.oe) - 1; i >= 0; i-- {
631                                 node := p.oe[i]
632                                 switch node.Data {
633                                 case "li":
634                                         p.popUntil(listItemScopeStopTags, "li")
635                                 case "address", "div", "p":
636                                         continue
637                                 default:
638                                         if !isSpecialElement[node.Data] {
639                                                 continue
640                                         }
641                                 }
642                                 break
643                         }
644                         p.popUntil(buttonScopeStopTags, "p")
645                         p.addElement(p.tok.Data, p.tok.Attr)
646                 case "dd", "dt":
647                         p.framesetOK = false
648                         for i := len(p.oe) - 1; i >= 0; i-- {
649                                 node := p.oe[i]
650                                 switch node.Data {
651                                 case "dd", "dt":
652                                         p.oe = p.oe[:i]
653                                 case "address", "div", "p":
654                                         continue
655                                 default:
656                                         if !isSpecialElement[node.Data] {
657                                                 continue
658                                         }
659                                 }
660                                 break
661                         }
662                         p.popUntil(buttonScopeStopTags, "p")
663                         p.addElement(p.tok.Data, p.tok.Attr)
664                 case "plaintext":
665                         p.popUntil(buttonScopeStopTags, "p")
666                         p.addElement(p.tok.Data, p.tok.Attr)
667                 case "optgroup", "option":
668                         if p.top().Data == "option" {
669                                 p.oe.pop()
670                         }
671                         p.reconstructActiveFormattingElements()
672                         p.addElement(p.tok.Data, p.tok.Attr)
673                 case "body":
674                         if len(p.oe) >= 2 {
675                                 body := p.oe[1]
676                                 if body.Type == ElementNode && body.Data == "body" {
677                                         p.framesetOK = false
678                                         copyAttributes(body, p.tok)
679                                 }
680                         }
681                 case "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title":
682                         return inHeadIM(p)
683                 case "image":
684                         p.tok.Data = "img"
685                         return false
686                 case "isindex":
687                         if p.form != nil {
688                                 // Ignore the token.
689                                 return true
690                         }
691                         action := ""
692                         prompt := "This is a searchable index. Enter search keywords: "
693                         attr := []Attribute{{Key: "name", Val: "isindex"}}
694                         for _, a := range p.tok.Attr {
695                                 switch a.Key {
696                                 case "action":
697                                         action = a.Val
698                                 case "name":
699                                         // Ignore the attribute.
700                                 case "prompt":
701                                         prompt = a.Val
702                                 default:
703                                         attr = append(attr, a)
704                                 }
705                         }
706                         p.acknowledgeSelfClosingTag()
707                         p.popUntil(buttonScopeStopTags, "p")
708                         p.addElement("form", nil)
709                         p.form = p.top()
710                         if action != "" {
711                                 p.form.Attr = []Attribute{{Key: "action", Val: action}}
712                         }
713                         p.addElement("hr", nil)
714                         p.oe.pop()
715                         p.addElement("label", nil)
716                         p.addText(prompt)
717                         p.addElement("input", attr)
718                         p.oe.pop()
719                         p.oe.pop()
720                         p.addElement("hr", nil)
721                         p.oe.pop()
722                         p.oe.pop()
723                         p.form = nil
724                 case "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr":
725                         // Ignore the token.
726                 default:
727                         // TODO.
728                         p.addElement(p.tok.Data, p.tok.Attr)
729                 }
730         case EndTagToken:
731                 switch p.tok.Data {
732                 case "body":
733                         // TODO: autoclose the stack of open elements.
734                         p.im = afterBodyIM
735                         return true
736                 case "p":
737                         if !p.elementInScope(buttonScopeStopTags, "p") {
738                                 p.addElement("p", nil)
739                         }
740                         p.popUntil(buttonScopeStopTags, "p")
741                 case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
742                         p.inBodyEndTagFormatting(p.tok.Data)
743                 case "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul":
744                         p.popUntil(defaultScopeStopTags, p.tok.Data)
745                 case "applet", "marquee", "object":
746                         if p.popUntil(defaultScopeStopTags, p.tok.Data) {
747                                 p.clearActiveFormattingElements()
748                         }
749                 case "br":
750                         p.tok.Type = StartTagToken
751                         return false
752                 default:
753                         p.inBodyEndTagOther(p.tok.Data)
754                 }
755         case CommentToken:
756                 p.addChild(&Node{
757                         Type: CommentNode,
758                         Data: p.tok.Data,
759                 })
760         }
761
762         return true
763 }
764
765 func (p *parser) inBodyEndTagFormatting(tag string) {
766         // This is the "adoption agency" algorithm, described at
767         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
768
769         // TODO: this is a fairly literal line-by-line translation of that algorithm.
770         // Once the code successfully parses the comprehensive test suite, we should
771         // refactor this code to be more idiomatic.
772
773         // Steps 1-3. The outer loop.
774         for i := 0; i < 8; i++ {
775                 // Step 4. Find the formatting element.
776                 var formattingElement *Node
777                 for j := len(p.afe) - 1; j >= 0; j-- {
778                         if p.afe[j].Type == scopeMarkerNode {
779                                 break
780                         }
781                         if p.afe[j].Data == tag {
782                                 formattingElement = p.afe[j]
783                                 break
784                         }
785                 }
786                 if formattingElement == nil {
787                         p.inBodyEndTagOther(tag)
788                         return
789                 }
790                 feIndex := p.oe.index(formattingElement)
791                 if feIndex == -1 {
792                         p.afe.remove(formattingElement)
793                         return
794                 }
795                 if !p.elementInScope(defaultScopeStopTags, tag) {
796                         // Ignore the tag.
797                         return
798                 }
799
800                 // Steps 5-6. Find the furthest block.
801                 var furthestBlock *Node
802                 for _, e := range p.oe[feIndex:] {
803                         if isSpecialElement[e.Data] {
804                                 furthestBlock = e
805                                 break
806                         }
807                 }
808                 if furthestBlock == nil {
809                         e := p.oe.pop()
810                         for e != formattingElement {
811                                 e = p.oe.pop()
812                         }
813                         p.afe.remove(e)
814                         return
815                 }
816
817                 // Steps 7-8. Find the common ancestor and bookmark node.
818                 commonAncestor := p.oe[feIndex-1]
819                 bookmark := p.afe.index(formattingElement)
820
821                 // Step 9. The inner loop. Find the lastNode to reparent.
822                 lastNode := furthestBlock
823                 node := furthestBlock
824                 x := p.oe.index(node)
825                 // Steps 9.1-9.3.
826                 for j := 0; j < 3; j++ {
827                         // Step 9.4.
828                         x--
829                         node = p.oe[x]
830                         // Step 9.5.
831                         if p.afe.index(node) == -1 {
832                                 p.oe.remove(node)
833                                 continue
834                         }
835                         // Step 9.6.
836                         if node == formattingElement {
837                                 break
838                         }
839                         // Step 9.7.
840                         clone := node.clone()
841                         p.afe[p.afe.index(node)] = clone
842                         p.oe[p.oe.index(node)] = clone
843                         node = clone
844                         // Step 9.8.
845                         if lastNode == furthestBlock {
846                                 bookmark = p.afe.index(node) + 1
847                         }
848                         // Step 9.9.
849                         if lastNode.Parent != nil {
850                                 lastNode.Parent.Remove(lastNode)
851                         }
852                         node.Add(lastNode)
853                         // Step 9.10.
854                         lastNode = node
855                 }
856
857                 // Step 10. Reparent lastNode to the common ancestor,
858                 // or for misnested table nodes, to the foster parent.
859                 if lastNode.Parent != nil {
860                         lastNode.Parent.Remove(lastNode)
861                 }
862                 switch commonAncestor.Data {
863                 case "table", "tbody", "tfoot", "thead", "tr":
864                         p.fosterParent(lastNode)
865                 default:
866                         commonAncestor.Add(lastNode)
867                 }
868
869                 // Steps 11-13. Reparent nodes from the furthest block's children
870                 // to a clone of the formatting element.
871                 clone := formattingElement.clone()
872                 reparentChildren(clone, furthestBlock)
873                 furthestBlock.Add(clone)
874
875                 // Step 14. Fix up the list of active formatting elements.
876                 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
877                         // Move the bookmark with the rest of the list.
878                         bookmark--
879                 }
880                 p.afe.remove(formattingElement)
881                 p.afe.insert(bookmark, clone)
882
883                 // Step 15. Fix up the stack of open elements.
884                 p.oe.remove(formattingElement)
885                 p.oe.insert(p.oe.index(furthestBlock)+1, clone)
886         }
887 }
888
889 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
890 func (p *parser) inBodyEndTagOther(tag string) {
891         for i := len(p.oe) - 1; i >= 0; i-- {
892                 if p.oe[i].Data == tag {
893                         p.oe = p.oe[:i]
894                         break
895                 }
896                 if isSpecialElement[p.oe[i].Data] {
897                         break
898                 }
899         }
900 }
901
902 // Section 11.2.5.4.8.
903 func textIM(p *parser) bool {
904         switch p.tok.Type {
905         case ErrorToken:
906                 p.oe.pop()
907         case TextToken:
908                 p.addText(p.tok.Data)
909                 return true
910         case EndTagToken:
911                 p.oe.pop()
912         }
913         p.im = p.originalIM
914         p.originalIM = nil
915         return p.tok.Type == EndTagToken
916 }
917
918 // Section 11.2.5.4.9.
919 func inTableIM(p *parser) bool {
920         switch p.tok.Type {
921         case ErrorToken:
922                 // Stop parsing.
923                 return true
924         case TextToken:
925                 // TODO.
926         case StartTagToken:
927                 switch p.tok.Data {
928                 case "caption":
929                         p.clearStackToContext(tableScopeStopTags)
930                         p.afe = append(p.afe, &scopeMarker)
931                         p.addElement(p.tok.Data, p.tok.Attr)
932                         p.im = inCaptionIM
933                         return true
934                 case "tbody", "tfoot", "thead":
935                         p.clearStackToContext(tableScopeStopTags)
936                         p.addElement(p.tok.Data, p.tok.Attr)
937                         p.im = inTableBodyIM
938                         return true
939                 case "td", "th", "tr":
940                         p.clearStackToContext(tableScopeStopTags)
941                         p.addElement("tbody", nil)
942                         p.im = inTableBodyIM
943                         return false
944                 case "table":
945                         if p.popUntil(tableScopeStopTags, "table") {
946                                 p.resetInsertionMode()
947                                 return false
948                         }
949                         // Ignore the token.
950                         return true
951                 case "colgroup":
952                         p.clearStackToContext(tableScopeStopTags)
953                         p.addElement(p.tok.Data, p.tok.Attr)
954                         p.im = inColumnGroupIM
955                         return true
956                 case "col":
957                         p.clearStackToContext(tableScopeStopTags)
958                         p.addElement("colgroup", p.tok.Attr)
959                         p.im = inColumnGroupIM
960                         return false
961                 default:
962                         // TODO.
963                 }
964         case EndTagToken:
965                 switch p.tok.Data {
966                 case "table":
967                         if p.popUntil(tableScopeStopTags, "table") {
968                                 p.resetInsertionMode()
969                                 return true
970                         }
971                         // Ignore the token.
972                         return true
973                 case "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr":
974                         // Ignore the token.
975                         return true
976                 }
977         case CommentToken:
978                 p.addChild(&Node{
979                         Type: CommentNode,
980                         Data: p.tok.Data,
981                 })
982                 return true
983         }
984
985         switch p.top().Data {
986         case "table", "tbody", "tfoot", "thead", "tr":
987                 p.fosterParenting = true
988                 defer func() { p.fosterParenting = false }()
989         }
990
991         return inBodyIM(p)
992 }
993
994 // clearStackToContext pops elements off the stack of open elements
995 // until an element listed in stopTags is found.
996 func (p *parser) clearStackToContext(stopTags []string) {
997         for i := len(p.oe) - 1; i >= 0; i-- {
998                 for _, tag := range stopTags {
999                         if p.oe[i].Data == tag {
1000                                 p.oe = p.oe[:i+1]
1001                                 return
1002                         }
1003                 }
1004         }
1005 }
1006
1007 // Section 11.2.5.4.11.
1008 func inCaptionIM(p *parser) bool {
1009         switch p.tok.Type {
1010         case StartTagToken:
1011                 switch p.tok.Data {
1012                 case "caption", "col", "colgroup", "tbody", "td", "tfoot", "thead", "tr":
1013                         if p.popUntil(tableScopeStopTags, "caption") {
1014                                 p.clearActiveFormattingElements()
1015                                 p.im = inTableIM
1016                                 return false
1017                         } else {
1018                                 // Ignore the token.
1019                                 return true
1020                         }
1021                 }
1022         case EndTagToken:
1023                 switch p.tok.Data {
1024                 case "caption":
1025                         if p.popUntil(tableScopeStopTags, "caption") {
1026                                 p.clearActiveFormattingElements()
1027                                 p.im = inTableIM
1028                         }
1029                         return true
1030                 case "table":
1031                         if p.popUntil(tableScopeStopTags, "caption") {
1032                                 p.clearActiveFormattingElements()
1033                                 p.im = inTableIM
1034                                 return false
1035                         } else {
1036                                 // Ignore the token.
1037                                 return true
1038                         }
1039                 case "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr":
1040                         // Ignore the token.
1041                         return true
1042                 }
1043         }
1044         return inBodyIM(p)
1045 }
1046
1047 // Section 11.2.5.4.12.
1048 func inColumnGroupIM(p *parser) bool {
1049         switch p.tok.Type {
1050         case CommentToken:
1051                 p.addChild(&Node{
1052                         Type: CommentNode,
1053                         Data: p.tok.Data,
1054                 })
1055                 return true
1056         case DoctypeToken:
1057                 // Ignore the token.
1058                 return true
1059         case StartTagToken:
1060                 switch p.tok.Data {
1061                 case "html":
1062                         return inBodyIM(p)
1063                 case "col":
1064                         p.addElement(p.tok.Data, p.tok.Attr)
1065                         p.oe.pop()
1066                         p.acknowledgeSelfClosingTag()
1067                         return true
1068                 }
1069         case EndTagToken:
1070                 switch p.tok.Data {
1071                 case "colgroup":
1072                         if p.oe.top().Data != "html" {
1073                                 p.oe.pop()
1074                         }
1075                         p.im = inTableIM
1076                         return true
1077                 case "col":
1078                         // Ignore the token.
1079                         return true
1080                 }
1081         }
1082         if p.oe.top().Data != "html" {
1083                 p.oe.pop()
1084         }
1085         p.im = inTableIM
1086         return false
1087 }
1088
1089 // Section 11.2.5.4.13.
1090 func inTableBodyIM(p *parser) bool {
1091         var (
1092                 add      bool
1093                 data     string
1094                 attr     []Attribute
1095                 consumed bool
1096         )
1097         switch p.tok.Type {
1098         case ErrorToken:
1099                 // TODO.
1100         case TextToken:
1101                 // TODO.
1102         case StartTagToken:
1103                 switch p.tok.Data {
1104                 case "tr":
1105                         add = true
1106                         data = p.tok.Data
1107                         attr = p.tok.Attr
1108                         consumed = true
1109                 case "td", "th":
1110                         add = true
1111                         data = "tr"
1112                         consumed = false
1113                 default:
1114                         // TODO.
1115                 }
1116         case EndTagToken:
1117                 switch p.tok.Data {
1118                 case "table":
1119                         if p.popUntil(tableScopeStopTags, "tbody", "thead", "tfoot") {
1120                                 p.im = inTableIM
1121                                 return false
1122                         }
1123                         // Ignore the token.
1124                         return true
1125                 case "body", "caption", "col", "colgroup", "html", "td", "th", "tr":
1126                         // Ignore the token.
1127                         return true
1128                 }
1129         case CommentToken:
1130                 p.addChild(&Node{
1131                         Type: CommentNode,
1132                         Data: p.tok.Data,
1133                 })
1134                 return true
1135         }
1136         if add {
1137                 // TODO: clear the stack back to a table body context.
1138                 p.addElement(data, attr)
1139                 p.im = inRowIM
1140                 return consumed
1141         }
1142         return inTableIM(p)
1143 }
1144
1145 // Section 11.2.5.4.14.
1146 func inRowIM(p *parser) bool {
1147         switch p.tok.Type {
1148         case ErrorToken:
1149                 // TODO.
1150         case TextToken:
1151                 // TODO.
1152         case StartTagToken:
1153                 switch p.tok.Data {
1154                 case "td", "th":
1155                         p.clearStackToContext(tableRowContextStopTags)
1156                         p.addElement(p.tok.Data, p.tok.Attr)
1157                         p.afe = append(p.afe, &scopeMarker)
1158                         p.im = inCellIM
1159                         return true
1160                 case "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr":
1161                         if p.popUntil(tableScopeStopTags, "tr") {
1162                                 p.im = inTableBodyIM
1163                                 return false
1164                         }
1165                         // Ignore the token.
1166                         return true
1167                 default:
1168                         // TODO.
1169                 }
1170         case EndTagToken:
1171                 switch p.tok.Data {
1172                 case "tr":
1173                         if p.popUntil(tableScopeStopTags, "tr") {
1174                                 p.im = inTableBodyIM
1175                                 return true
1176                         }
1177                         // Ignore the token.
1178                         return true
1179                 case "table":
1180                         if p.popUntil(tableScopeStopTags, "tr") {
1181                                 p.im = inTableBodyIM
1182                                 return false
1183                         }
1184                         // Ignore the token.
1185                         return true
1186                 case "tbody", "tfoot", "thead":
1187                         // TODO.
1188                 case "body", "caption", "col", "colgroup", "html", "td", "th":
1189                         // Ignore the token.
1190                         return true
1191                 default:
1192                         // TODO.
1193                 }
1194         case CommentToken:
1195                 p.addChild(&Node{
1196                         Type: CommentNode,
1197                         Data: p.tok.Data,
1198                 })
1199                 return true
1200         }
1201         return inTableIM(p)
1202 }
1203
1204 // Section 11.2.5.4.15.
1205 func inCellIM(p *parser) bool {
1206         var (
1207                 closeTheCellAndReprocess bool
1208         )
1209         switch p.tok.Type {
1210         case StartTagToken:
1211                 switch p.tok.Data {
1212                 case "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr":
1213                         // TODO: check for "td" or "th" in table scope.
1214                         closeTheCellAndReprocess = true
1215                 }
1216         case EndTagToken:
1217                 switch p.tok.Data {
1218                 case "td", "th":
1219                         if !p.popUntil(tableScopeStopTags, p.tok.Data) {
1220                                 // Ignore the token.
1221                                 return true
1222                         }
1223                         p.clearActiveFormattingElements()
1224                         p.im = inRowIM
1225                         return true
1226                 case "body", "caption", "col", "colgroup", "html":
1227                         // TODO.
1228                 case "table", "tbody", "tfoot", "thead", "tr":
1229                         // TODO: check for matching element in table scope.
1230                         closeTheCellAndReprocess = true
1231                 }
1232         case CommentToken:
1233                 p.addChild(&Node{
1234                         Type: CommentNode,
1235                         Data: p.tok.Data,
1236                 })
1237                 return true
1238         }
1239         if closeTheCellAndReprocess {
1240                 if p.popUntil(tableScopeStopTags, "td") || p.popUntil(tableScopeStopTags, "th") {
1241                         p.clearActiveFormattingElements()
1242                         p.im = inRowIM
1243                         return false
1244                 }
1245         }
1246         return inBodyIM(p)
1247 }
1248
1249 // Section 11.2.5.4.16.
1250 func inSelectIM(p *parser) bool {
1251         endSelect := false
1252         switch p.tok.Type {
1253         case ErrorToken:
1254                 // TODO.
1255         case TextToken:
1256                 p.addText(p.tok.Data)
1257         case StartTagToken:
1258                 switch p.tok.Data {
1259                 case "html":
1260                         // TODO.
1261                 case "option":
1262                         if p.top().Data == "option" {
1263                                 p.oe.pop()
1264                         }
1265                         p.addElement(p.tok.Data, p.tok.Attr)
1266                 case "optgroup":
1267                         if p.top().Data == "option" {
1268                                 p.oe.pop()
1269                         }
1270                         if p.top().Data == "optgroup" {
1271                                 p.oe.pop()
1272                         }
1273                         p.addElement(p.tok.Data, p.tok.Attr)
1274                 case "select":
1275                         endSelect = true
1276                 case "input", "keygen", "textarea":
1277                         // TODO.
1278                 case "script":
1279                         // TODO.
1280                 default:
1281                         // Ignore the token.
1282                 }
1283         case EndTagToken:
1284                 switch p.tok.Data {
1285                 case "option":
1286                         if p.top().Data == "option" {
1287                                 p.oe.pop()
1288                         }
1289                 case "optgroup":
1290                         i := len(p.oe) - 1
1291                         if p.oe[i].Data == "option" {
1292                                 i--
1293                         }
1294                         if p.oe[i].Data == "optgroup" {
1295                                 p.oe = p.oe[:i]
1296                         }
1297                 case "select":
1298                         endSelect = true
1299                 default:
1300                         // Ignore the token.
1301                 }
1302         case CommentToken:
1303                 p.doc.Add(&Node{
1304                         Type: CommentNode,
1305                         Data: p.tok.Data,
1306                 })
1307         }
1308         if endSelect {
1309                 for i := len(p.oe) - 1; i >= 0; i-- {
1310                         switch p.oe[i].Data {
1311                         case "select":
1312                                 p.oe = p.oe[:i]
1313                                 p.resetInsertionMode()
1314                                 return true
1315                         case "option", "optgroup":
1316                                 continue
1317                         default:
1318                                 // Ignore the token.
1319                                 return true
1320                         }
1321                 }
1322         }
1323         return true
1324 }
1325
1326 // Section 11.2.5.4.18.
1327 func afterBodyIM(p *parser) bool {
1328         switch p.tok.Type {
1329         case ErrorToken:
1330                 // Stop parsing.
1331                 return true
1332         case StartTagToken:
1333                 if p.tok.Data == "html" {
1334                         return inBodyIM(p)
1335                 }
1336         case EndTagToken:
1337                 if p.tok.Data == "html" {
1338                         p.im = afterAfterBodyIM
1339                         return true
1340                 }
1341         case CommentToken:
1342                 // The comment is attached to the <html> element.
1343                 if len(p.oe) < 1 || p.oe[0].Data != "html" {
1344                         panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
1345                 }
1346                 p.oe[0].Add(&Node{
1347                         Type: CommentNode,
1348                         Data: p.tok.Data,
1349                 })
1350                 return true
1351         }
1352         p.im = inBodyIM
1353         return false
1354 }
1355
1356 // Section 11.2.5.4.19.
1357 func inFramesetIM(p *parser) bool {
1358         switch p.tok.Type {
1359         case CommentToken:
1360                 p.addChild(&Node{
1361                         Type: CommentNode,
1362                         Data: p.tok.Data,
1363                 })
1364         case StartTagToken:
1365                 switch p.tok.Data {
1366                 case "html":
1367                         return inBodyIM(p)
1368                 case "frameset":
1369                         p.addElement(p.tok.Data, p.tok.Attr)
1370                 case "frame":
1371                         p.addElement(p.tok.Data, p.tok.Attr)
1372                         p.oe.pop()
1373                         p.acknowledgeSelfClosingTag()
1374                 case "noframes":
1375                         return inHeadIM(p)
1376                 }
1377         case EndTagToken:
1378                 switch p.tok.Data {
1379                 case "frameset":
1380                         if p.oe.top().Data != "html" {
1381                                 p.oe.pop()
1382                                 if p.oe.top().Data != "frameset" {
1383                                         p.im = afterFramesetIM
1384                                         return true
1385                                 }
1386                         }
1387                 }
1388         default:
1389                 // Ignore the token.
1390         }
1391         return true
1392 }
1393
1394 // Section 11.2.5.4.20.
1395 func afterFramesetIM(p *parser) bool {
1396         switch p.tok.Type {
1397         case CommentToken:
1398                 p.addChild(&Node{
1399                         Type: CommentNode,
1400                         Data: p.tok.Data,
1401                 })
1402         case StartTagToken:
1403                 switch p.tok.Data {
1404                 case "html":
1405                         return inBodyIM(p)
1406                 case "noframes":
1407                         return inHeadIM(p)
1408                 }
1409         case EndTagToken:
1410                 switch p.tok.Data {
1411                 case "html":
1412                         p.im = afterAfterFramesetIM
1413                         return true
1414                 }
1415         default:
1416                 // Ignore the token.
1417         }
1418         return true
1419 }
1420
1421 // Section 11.2.5.4.21.
1422 func afterAfterBodyIM(p *parser) bool {
1423         switch p.tok.Type {
1424         case ErrorToken:
1425                 // Stop parsing.
1426                 return true
1427         case TextToken:
1428                 // TODO.
1429         case StartTagToken:
1430                 if p.tok.Data == "html" {
1431                         return inBodyIM(p)
1432                 }
1433         case CommentToken:
1434                 p.doc.Add(&Node{
1435                         Type: CommentNode,
1436                         Data: p.tok.Data,
1437                 })
1438                 return true
1439         }
1440         p.im = inBodyIM
1441         return false
1442 }
1443
1444 // Section 11.2.5.4.22.
1445 func afterAfterFramesetIM(p *parser) bool {
1446         switch p.tok.Type {
1447         case CommentToken:
1448                 p.addChild(&Node{
1449                         Type: CommentNode,
1450                         Data: p.tok.Data,
1451                 })
1452         case StartTagToken:
1453                 switch p.tok.Data {
1454                 case "html":
1455                         return inBodyIM(p)
1456                 case "noframes":
1457                         return inHeadIM(p)
1458                 }
1459         default:
1460                 // Ignore the token.
1461         }
1462         return true
1463 }
1464
1465 // Parse returns the parse tree for the HTML from the given Reader.
1466 // The input is assumed to be UTF-8 encoded.
1467 func Parse(r io.Reader) (*Node, error) {
1468         p := &parser{
1469                 tokenizer: NewTokenizer(r),
1470                 doc: &Node{
1471                         Type: DocumentNode,
1472                 },
1473                 scripting:  true,
1474                 framesetOK: true,
1475                 im:         initialIM,
1476         }
1477         // Iterate until EOF. Any other error will cause an early return.
1478         consumed := true
1479         for {
1480                 if consumed {
1481                         if err := p.read(); err != nil {
1482                                 if err == io.EOF {
1483                                         break
1484                                 }
1485                                 return nil, err
1486                         }
1487                 }
1488                 consumed = p.im(p)
1489         }
1490         // Loop until the final token (the ErrorToken signifying EOF) is consumed.
1491         for {
1492                 if consumed = p.im(p); consumed {
1493                         break
1494                 }
1495         }
1496         return p.doc, nil
1497 }