OSDN Git Service

4cb246969e54ca1c3b860b167d718bfd0376c4be
[pf3gnuchains/gcc-fork.git] / libgo / go / html / parse.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package html
6
7 import (
8         "io"
9         "strings"
10 )
11
12 // A parser implements the HTML5 parsing algorithm:
13 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
14 type parser struct {
15         // tokenizer provides the tokens for the parser.
16         tokenizer *Tokenizer
17         // tok is the most recently read token.
18         tok Token
19         // Self-closing tags like <hr/> are re-interpreted as a two-token sequence:
20         // <hr> followed by </hr>. hasSelfClosingToken is true if we have just read
21         // the synthetic start tag and the next one due is the matching end tag.
22         hasSelfClosingToken bool
23         // doc is the document root element.
24         doc *Node
25         // The stack of open elements (section 12.2.3.2) and active formatting
26         // elements (section 12.2.3.3).
27         oe, afe nodeStack
28         // Element pointers (section 12.2.3.4).
29         head, form *Node
30         // Other parsing state flags (section 12.2.3.5).
31         scripting, framesetOK bool
32         // im is the current insertion mode.
33         im insertionMode
34         // originalIM is the insertion mode to go back to after completing a text
35         // or inTableText insertion mode.
36         originalIM insertionMode
37         // fosterParenting is whether new elements should be inserted according to
38         // the foster parenting rules (section 12.2.5.3).
39         fosterParenting bool
40         // quirks is whether the parser is operating in "quirks mode."
41         quirks bool
42         // context is the context element when parsing an HTML fragment
43         // (section 12.4).
44         context *Node
45 }
46
47 func (p *parser) top() *Node {
48         if n := p.oe.top(); n != nil {
49                 return n
50         }
51         return p.doc
52 }
53
// Stop-tag lists for popUntil and friends, taken from section 12.2.3.2.
// Each list names the tags that bound the corresponding kind of scope.
var (
	defaultScopeStopTags = []string{
		"applet", "caption", "html", "table", "td", "th", "marquee", "object",
	}
	listItemScopeStopTags = []string{
		"applet", "caption", "html", "table", "td", "th", "marquee", "object",
		"ol", "ul",
	}
	buttonScopeStopTags = []string{
		"applet", "caption", "html", "table", "td", "th", "marquee", "object",
		"button",
	}
	tableScopeStopTags = []string{
		"html", "table",
	}
)
61
// Stop-tag lists intended for clearStackToContext.
var (
	tableRowContextStopTags = []string{
		"tr",
		"html",
	}
)
66
67 // popUntil pops the stack of open elements at the highest element whose tag
68 // is in matchTags, provided there is no higher element in stopTags. It returns
69 // whether or not there was such an element. If there was not, popUntil leaves
70 // the stack unchanged.
71 //
72 // For example, if the stack was:
73 // ["html", "body", "font", "table", "b", "i", "u"]
74 // then popUntil([]string{"html, "table"}, "font") would return false, but
75 // popUntil([]string{"html, "table"}, "i") would return true and the resultant
76 // stack would be:
77 // ["html", "body", "font", "table", "b"]
78 //
79 // If an element's tag is in both stopTags and matchTags, then the stack will
80 // be popped and the function returns true (provided, of course, there was no
81 // higher element in the stack that was also in stopTags). For example,
82 // popUntil([]string{"html, "table"}, "table") would return true and leave:
83 // ["html", "body", "font"]
84 func (p *parser) popUntil(stopTags []string, matchTags ...string) bool {
85         if i := p.indexOfElementInScope(stopTags, matchTags...); i != -1 {
86                 p.oe = p.oe[:i]
87                 return true
88         }
89         return false
90 }
91
92 // indexOfElementInScope returns the index in p.oe of the highest element
93 // whose tag is in matchTags that is in scope according to stopTags.
94 // If no matching element is in scope, it returns -1.
95 func (p *parser) indexOfElementInScope(stopTags []string, matchTags ...string) int {
96         for i := len(p.oe) - 1; i >= 0; i-- {
97                 tag := p.oe[i].Data
98                 for _, t := range matchTags {
99                         if t == tag {
100                                 return i
101                         }
102                 }
103                 for _, t := range stopTags {
104                         if t == tag {
105                                 return -1
106                         }
107                 }
108         }
109         return -1
110 }
111
112 // elementInScope is like popUntil, except that it doesn't modify the stack of
113 // open elements.
114 func (p *parser) elementInScope(stopTags []string, matchTags ...string) bool {
115         return p.indexOfElementInScope(stopTags, matchTags...) != -1
116 }
117
118 // addChild adds a child node n to the top element, and pushes n onto the stack
119 // of open elements if it is an element node.
120 func (p *parser) addChild(n *Node) {
121         if p.fosterParenting {
122                 p.fosterParent(n)
123         } else {
124                 p.top().Add(n)
125         }
126
127         if n.Type == ElementNode {
128                 p.oe = append(p.oe, n)
129         }
130 }
131
132 // fosterParent adds a child node according to the foster parenting rules.
133 // Section 12.2.5.3, "foster parenting".
134 func (p *parser) fosterParent(n *Node) {
135         p.fosterParenting = false
136         var table, parent *Node
137         var i int
138         for i = len(p.oe) - 1; i >= 0; i-- {
139                 if p.oe[i].Data == "table" {
140                         table = p.oe[i]
141                         break
142                 }
143         }
144
145         if table == nil {
146                 // The foster parent is the html element.
147                 parent = p.oe[0]
148         } else {
149                 parent = table.Parent
150         }
151         if parent == nil {
152                 parent = p.oe[i-1]
153         }
154
155         var child *Node
156         for i, child = range parent.Child {
157                 if child == table {
158                         break
159                 }
160         }
161
162         if i > 0 && parent.Child[i-1].Type == TextNode && n.Type == TextNode {
163                 parent.Child[i-1].Data += n.Data
164                 return
165         }
166
167         if i == len(parent.Child) {
168                 parent.Add(n)
169         } else {
170                 // Insert n into parent.Child at index i.
171                 parent.Child = append(parent.Child[:i+1], parent.Child[i:]...)
172                 parent.Child[i] = n
173                 n.Parent = parent
174         }
175 }
176
177 // addText adds text to the preceding node if it is a text node, or else it
178 // calls addChild with a new text node.
179 func (p *parser) addText(text string) {
180         // TODO: distinguish whitespace text from others.
181         t := p.top()
182         if i := len(t.Child); i > 0 && t.Child[i-1].Type == TextNode {
183                 t.Child[i-1].Data += text
184                 return
185         }
186         p.addChild(&Node{
187                 Type: TextNode,
188                 Data: text,
189         })
190 }
191
192 // addElement calls addChild with an element node.
193 func (p *parser) addElement(tag string, attr []Attribute) {
194         p.addChild(&Node{
195                 Type:      ElementNode,
196                 Data:      tag,
197                 Namespace: p.top().Namespace,
198                 Attr:      attr,
199         })
200 }
201
202 // Section 12.2.3.3.
203 func (p *parser) addFormattingElement(tag string, attr []Attribute) {
204         p.addElement(tag, attr)
205         p.afe = append(p.afe, p.top())
206         // TODO.
207 }
208
209 // Section 12.2.3.3.
210 func (p *parser) clearActiveFormattingElements() {
211         for {
212                 n := p.afe.pop()
213                 if len(p.afe) == 0 || n.Type == scopeMarkerNode {
214                         return
215                 }
216         }
217 }
218
219 // Section 12.2.3.3.
220 func (p *parser) reconstructActiveFormattingElements() {
221         n := p.afe.top()
222         if n == nil {
223                 return
224         }
225         if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
226                 return
227         }
228         i := len(p.afe) - 1
229         for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
230                 if i == 0 {
231                         i = -1
232                         break
233                 }
234                 i--
235                 n = p.afe[i]
236         }
237         for {
238                 i++
239                 clone := p.afe[i].clone()
240                 p.addChild(clone)
241                 p.afe[i] = clone
242                 if i == len(p.afe)-1 {
243                         break
244                 }
245         }
246 }
247
248 // read reads the next token. This is usually from the tokenizer, but it may
249 // be the synthesized end tag implied by a self-closing tag.
250 func (p *parser) read() error {
251         if p.hasSelfClosingToken {
252                 p.hasSelfClosingToken = false
253                 p.tok.Type = EndTagToken
254                 p.tok.Attr = nil
255                 return nil
256         }
257         p.tokenizer.Next()
258         p.tok = p.tokenizer.Token()
259         switch p.tok.Type {
260         case ErrorToken:
261                 return p.tokenizer.Err()
262         case SelfClosingTagToken:
263                 p.hasSelfClosingToken = true
264                 p.tok.Type = StartTagToken
265         }
266         return nil
267 }
268
269 // Section 12.2.4.
270 func (p *parser) acknowledgeSelfClosingTag() {
271         p.hasSelfClosingToken = false
272 }
273
274 // An insertion mode (section 12.2.3.1) is the state transition function from
275 // a particular state in the HTML5 parser's state machine. It updates the
276 // parser's fields depending on parser.tok (where ErrorToken means EOF).
277 // It returns whether the token was consumed.
278 type insertionMode func(*parser) bool
279
280 // setOriginalIM sets the insertion mode to return to after completing a text or
281 // inTableText insertion mode.
282 // Section 12.2.3.1, "using the rules for".
283 func (p *parser) setOriginalIM() {
284         if p.originalIM != nil {
285                 panic("html: bad parser state: originalIM was set twice")
286         }
287         p.originalIM = p.im
288 }
289
290 // Section 12.2.3.1, "reset the insertion mode".
291 func (p *parser) resetInsertionMode() {
292         for i := len(p.oe) - 1; i >= 0; i-- {
293                 n := p.oe[i]
294                 if i == 0 && p.context != nil {
295                         n = p.context
296                 }
297
298                 switch n.Data {
299                 case "select":
300                         p.im = inSelectIM
301                 case "td", "th":
302                         p.im = inCellIM
303                 case "tr":
304                         p.im = inRowIM
305                 case "tbody", "thead", "tfoot":
306                         p.im = inTableBodyIM
307                 case "caption":
308                         p.im = inCaptionIM
309                 case "colgroup":
310                         p.im = inColumnGroupIM
311                 case "table":
312                         p.im = inTableIM
313                 case "head":
314                         p.im = inBodyIM
315                 case "body":
316                         p.im = inBodyIM
317                 case "frameset":
318                         p.im = inFramesetIM
319                 case "html":
320                         p.im = beforeHeadIM
321                 default:
322                         if p.top().Namespace == "" {
323                                 continue
324                         }
325                         p.im = inForeignContentIM
326                 }
327                 return
328         }
329         p.im = inBodyIM
330 }
331
// whitespace is the cutset passed to strings.TrimLeft when the insertion
// modes strip leading whitespace from text tokens.
const whitespace = " \t\r\n\f"
333
334 // Section 12.2.5.4.1.
335 func initialIM(p *parser) bool {
336         switch p.tok.Type {
337         case TextToken:
338                 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
339                 if len(p.tok.Data) == 0 {
340                         // It was all whitespace, so ignore it.
341                         return true
342                 }
343         case CommentToken:
344                 p.doc.Add(&Node{
345                         Type: CommentNode,
346                         Data: p.tok.Data,
347                 })
348                 return true
349         case DoctypeToken:
350                 n, quirks := parseDoctype(p.tok.Data)
351                 p.doc.Add(n)
352                 p.quirks = quirks
353                 p.im = beforeHTMLIM
354                 return true
355         }
356         p.quirks = true
357         p.im = beforeHTMLIM
358         return false
359 }
360
361 // Section 12.2.5.4.2.
362 func beforeHTMLIM(p *parser) bool {
363         switch p.tok.Type {
364         case TextToken:
365                 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
366                 if len(p.tok.Data) == 0 {
367                         // It was all whitespace, so ignore it.
368                         return true
369                 }
370         case StartTagToken:
371                 if p.tok.Data == "html" {
372                         p.addElement(p.tok.Data, p.tok.Attr)
373                         p.im = beforeHeadIM
374                         return true
375                 }
376         case EndTagToken:
377                 switch p.tok.Data {
378                 case "head", "body", "html", "br":
379                         // Drop down to creating an implied <html> tag.
380                 default:
381                         // Ignore the token.
382                         return true
383                 }
384         case CommentToken:
385                 p.doc.Add(&Node{
386                         Type: CommentNode,
387                         Data: p.tok.Data,
388                 })
389                 return true
390         }
391         // Create an implied <html> tag.
392         p.addElement("html", nil)
393         p.im = beforeHeadIM
394         return false
395 }
396
397 // Section 12.2.5.4.3.
398 func beforeHeadIM(p *parser) bool {
399         var (
400                 add     bool
401                 attr    []Attribute
402                 implied bool
403         )
404         switch p.tok.Type {
405         case ErrorToken:
406                 implied = true
407         case TextToken:
408                 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
409                 if len(p.tok.Data) == 0 {
410                         // It was all whitespace, so ignore it.
411                         return true
412                 }
413                 implied = true
414         case StartTagToken:
415                 switch p.tok.Data {
416                 case "head":
417                         add = true
418                         attr = p.tok.Attr
419                 case "html":
420                         return inBodyIM(p)
421                 default:
422                         implied = true
423                 }
424         case EndTagToken:
425                 switch p.tok.Data {
426                 case "head", "body", "html", "br":
427                         implied = true
428                 default:
429                         // Ignore the token.
430                 }
431         case CommentToken:
432                 p.addChild(&Node{
433                         Type: CommentNode,
434                         Data: p.tok.Data,
435                 })
436                 return true
437         }
438         if add || implied {
439                 p.addElement("head", attr)
440                 p.head = p.top()
441         }
442         p.im = inHeadIM
443         return !implied
444 }
445
446 // Section 12.2.5.4.4.
447 func inHeadIM(p *parser) bool {
448         var (
449                 pop     bool
450                 implied bool
451         )
452         switch p.tok.Type {
453         case ErrorToken:
454                 implied = true
455         case TextToken:
456                 s := strings.TrimLeft(p.tok.Data, whitespace)
457                 if len(s) < len(p.tok.Data) {
458                         // Add the initial whitespace to the current node.
459                         p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
460                         if s == "" {
461                                 return true
462                         }
463                         p.tok.Data = s
464                 }
465                 implied = true
466         case StartTagToken:
467                 switch p.tok.Data {
468                 case "html":
469                         return inBodyIM(p)
470                 case "base", "basefont", "bgsound", "command", "link", "meta":
471                         p.addElement(p.tok.Data, p.tok.Attr)
472                         p.oe.pop()
473                         p.acknowledgeSelfClosingTag()
474                 case "script", "title", "noscript", "noframes", "style":
475                         p.addElement(p.tok.Data, p.tok.Attr)
476                         p.setOriginalIM()
477                         p.im = textIM
478                         return true
479                 case "head":
480                         // Ignore the token.
481                         return true
482                 default:
483                         implied = true
484                 }
485         case EndTagToken:
486                 switch p.tok.Data {
487                 case "head":
488                         pop = true
489                 case "body", "html", "br":
490                         implied = true
491                 default:
492                         // Ignore the token.
493                         return true
494                 }
495         case CommentToken:
496                 p.addChild(&Node{
497                         Type: CommentNode,
498                         Data: p.tok.Data,
499                 })
500                 return true
501         }
502         if pop || implied {
503                 n := p.oe.pop()
504                 if n.Data != "head" {
505                         panic("html: bad parser state: <head> element not found, in the in-head insertion mode")
506                 }
507                 p.im = afterHeadIM
508                 return !implied
509         }
510         return true
511 }
512
513 // Section 12.2.5.4.6.
514 func afterHeadIM(p *parser) bool {
515         var (
516                 add        bool
517                 attr       []Attribute
518                 framesetOK bool
519                 implied    bool
520         )
521         switch p.tok.Type {
522         case ErrorToken:
523                 implied = true
524                 framesetOK = true
525         case TextToken:
526                 s := strings.TrimLeft(p.tok.Data, whitespace)
527                 if len(s) < len(p.tok.Data) {
528                         // Add the initial whitespace to the current node.
529                         p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
530                         if s == "" {
531                                 return true
532                         }
533                         p.tok.Data = s
534                 }
535                 implied = true
536                 framesetOK = true
537         case StartTagToken:
538                 switch p.tok.Data {
539                 case "html":
540                         // TODO.
541                 case "body":
542                         add = true
543                         attr = p.tok.Attr
544                         framesetOK = false
545                 case "frameset":
546                         p.addElement(p.tok.Data, p.tok.Attr)
547                         p.im = inFramesetIM
548                         return true
549                 case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title":
550                         p.oe = append(p.oe, p.head)
551                         defer p.oe.pop()
552                         return inHeadIM(p)
553                 case "head":
554                         // Ignore the token.
555                         return true
556                 default:
557                         implied = true
558                         framesetOK = true
559                 }
560         case EndTagToken:
561                 switch p.tok.Data {
562                 case "body", "html", "br":
563                         implied = true
564                         framesetOK = true
565                 default:
566                         // Ignore the token.
567                         return true
568                 }
569         case CommentToken:
570                 p.addChild(&Node{
571                         Type: CommentNode,
572                         Data: p.tok.Data,
573                 })
574                 return true
575         }
576         if add || implied {
577                 p.addElement("body", attr)
578                 p.framesetOK = framesetOK
579         }
580         p.im = inBodyIM
581         return !implied
582 }
583
584 // copyAttributes copies attributes of src not found on dst to dst.
585 func copyAttributes(dst *Node, src Token) {
586         if len(src.Attr) == 0 {
587                 return
588         }
589         attr := map[string]string{}
590         for _, a := range dst.Attr {
591                 attr[a.Key] = a.Val
592         }
593         for _, a := range src.Attr {
594                 if _, ok := attr[a.Key]; !ok {
595                         dst.Attr = append(dst.Attr, a)
596                         attr[a.Key] = a.Val
597                 }
598         }
599 }
600
601 // Section 12.2.5.4.7.
602 func inBodyIM(p *parser) bool {
603         switch p.tok.Type {
604         case TextToken:
605                 switch n := p.oe.top(); n.Data {
606                 case "pre", "listing", "textarea":
607                         if len(n.Child) == 0 {
608                                 // Ignore a newline at the start of a <pre> block.
609                                 d := p.tok.Data
610                                 if d != "" && d[0] == '\r' {
611                                         d = d[1:]
612                                 }
613                                 if d != "" && d[0] == '\n' {
614                                         d = d[1:]
615                                 }
616                                 if d == "" {
617                                         return true
618                                 }
619                                 p.tok.Data = d
620                         }
621                 }
622                 p.reconstructActiveFormattingElements()
623                 p.addText(p.tok.Data)
624                 p.framesetOK = false
625         case StartTagToken:
626                 switch p.tok.Data {
627                 case "html":
628                         copyAttributes(p.oe[0], p.tok)
629                 case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul":
630                         p.popUntil(buttonScopeStopTags, "p")
631                         p.addElement(p.tok.Data, p.tok.Attr)
632                 case "h1", "h2", "h3", "h4", "h5", "h6":
633                         p.popUntil(buttonScopeStopTags, "p")
634                         switch n := p.top(); n.Data {
635                         case "h1", "h2", "h3", "h4", "h5", "h6":
636                                 p.oe.pop()
637                         }
638                         p.addElement(p.tok.Data, p.tok.Attr)
639                 case "a":
640                         for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
641                                 if n := p.afe[i]; n.Type == ElementNode && n.Data == "a" {
642                                         p.inBodyEndTagFormatting("a")
643                                         p.oe.remove(n)
644                                         p.afe.remove(n)
645                                         break
646                                 }
647                         }
648                         p.reconstructActiveFormattingElements()
649                         p.addFormattingElement(p.tok.Data, p.tok.Attr)
650                 case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u":
651                         p.reconstructActiveFormattingElements()
652                         p.addFormattingElement(p.tok.Data, p.tok.Attr)
653                 case "nobr":
654                         p.reconstructActiveFormattingElements()
655                         if p.elementInScope(defaultScopeStopTags, "nobr") {
656                                 p.inBodyEndTagFormatting("nobr")
657                                 p.reconstructActiveFormattingElements()
658                         }
659                         p.addFormattingElement(p.tok.Data, p.tok.Attr)
660                 case "applet", "marquee", "object":
661                         p.reconstructActiveFormattingElements()
662                         p.addElement(p.tok.Data, p.tok.Attr)
663                         p.afe = append(p.afe, &scopeMarker)
664                         p.framesetOK = false
665                 case "area", "br", "embed", "img", "input", "keygen", "wbr":
666                         p.reconstructActiveFormattingElements()
667                         p.addElement(p.tok.Data, p.tok.Attr)
668                         p.oe.pop()
669                         p.acknowledgeSelfClosingTag()
670                         p.framesetOK = false
671                 case "table":
672                         if !p.quirks {
673                                 p.popUntil(buttonScopeStopTags, "p")
674                         }
675                         p.addElement(p.tok.Data, p.tok.Attr)
676                         p.framesetOK = false
677                         p.im = inTableIM
678                         return true
679                 case "hr":
680                         p.popUntil(buttonScopeStopTags, "p")
681                         p.addElement(p.tok.Data, p.tok.Attr)
682                         p.oe.pop()
683                         p.acknowledgeSelfClosingTag()
684                         p.framesetOK = false
685                 case "select":
686                         p.reconstructActiveFormattingElements()
687                         p.addElement(p.tok.Data, p.tok.Attr)
688                         p.framesetOK = false
689                         // TODO: detect <select> inside a table.
690                         p.im = inSelectIM
691                         return true
692                 case "form":
693                         if p.form == nil {
694                                 p.popUntil(buttonScopeStopTags, "p")
695                                 p.addElement(p.tok.Data, p.tok.Attr)
696                                 p.form = p.top()
697                         }
698                 case "li":
699                         p.framesetOK = false
700                         for i := len(p.oe) - 1; i >= 0; i-- {
701                                 node := p.oe[i]
702                                 switch node.Data {
703                                 case "li":
704                                         p.popUntil(listItemScopeStopTags, "li")
705                                 case "address", "div", "p":
706                                         continue
707                                 default:
708                                         if !isSpecialElement[node.Data] {
709                                                 continue
710                                         }
711                                 }
712                                 break
713                         }
714                         p.popUntil(buttonScopeStopTags, "p")
715                         p.addElement(p.tok.Data, p.tok.Attr)
716                 case "dd", "dt":
717                         p.framesetOK = false
718                         for i := len(p.oe) - 1; i >= 0; i-- {
719                                 node := p.oe[i]
720                                 switch node.Data {
721                                 case "dd", "dt":
722                                         p.oe = p.oe[:i]
723                                 case "address", "div", "p":
724                                         continue
725                                 default:
726                                         if !isSpecialElement[node.Data] {
727                                                 continue
728                                         }
729                                 }
730                                 break
731                         }
732                         p.popUntil(buttonScopeStopTags, "p")
733                         p.addElement(p.tok.Data, p.tok.Attr)
734                 case "plaintext":
735                         p.popUntil(buttonScopeStopTags, "p")
736                         p.addElement(p.tok.Data, p.tok.Attr)
737                 case "button":
738                         p.popUntil(defaultScopeStopTags, "button")
739                         p.reconstructActiveFormattingElements()
740                         p.addElement(p.tok.Data, p.tok.Attr)
741                         p.framesetOK = false
742                 case "optgroup", "option":
743                         if p.top().Data == "option" {
744                                 p.oe.pop()
745                         }
746                         p.reconstructActiveFormattingElements()
747                         p.addElement(p.tok.Data, p.tok.Attr)
748                 case "body":
749                         if len(p.oe) >= 2 {
750                                 body := p.oe[1]
751                                 if body.Type == ElementNode && body.Data == "body" {
752                                         p.framesetOK = false
753                                         copyAttributes(body, p.tok)
754                                 }
755                         }
756                 case "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title":
757                         return inHeadIM(p)
758                 case "image":
759                         p.tok.Data = "img"
760                         return false
761                 case "isindex":
762                         if p.form != nil {
763                                 // Ignore the token.
764                                 return true
765                         }
766                         action := ""
767                         prompt := "This is a searchable index. Enter search keywords: "
768                         attr := []Attribute{{Key: "name", Val: "isindex"}}
769                         for _, a := range p.tok.Attr {
770                                 switch a.Key {
771                                 case "action":
772                                         action = a.Val
773                                 case "name":
774                                         // Ignore the attribute.
775                                 case "prompt":
776                                         prompt = a.Val
777                                 default:
778                                         attr = append(attr, a)
779                                 }
780                         }
781                         p.acknowledgeSelfClosingTag()
782                         p.popUntil(buttonScopeStopTags, "p")
783                         p.addElement("form", nil)
784                         p.form = p.top()
785                         if action != "" {
786                                 p.form.Attr = []Attribute{{Key: "action", Val: action}}
787                         }
788                         p.addElement("hr", nil)
789                         p.oe.pop()
790                         p.addElement("label", nil)
791                         p.addText(prompt)
792                         p.addElement("input", attr)
793                         p.oe.pop()
794                         p.oe.pop()
795                         p.addElement("hr", nil)
796                         p.oe.pop()
797                         p.oe.pop()
798                         p.form = nil
799                 case "xmp":
800                         p.popUntil(buttonScopeStopTags, "p")
801                         p.reconstructActiveFormattingElements()
802                         p.framesetOK = false
803                         p.addElement(p.tok.Data, p.tok.Attr)
804                 case "math", "svg":
805                         p.reconstructActiveFormattingElements()
806                         namespace := ""
807                         if p.tok.Data == "math" {
808                                 // TODO: adjust MathML attributes.
809                                 namespace = "mathml"
810                         } else {
811                                 // TODO: adjust SVG attributes.
812                                 namespace = "svg"
813                         }
814                         // TODO: adjust foreign attributes.
815                         p.addElement(p.tok.Data, p.tok.Attr)
816                         p.top().Namespace = namespace
817                         p.im = inForeignContentIM
818                         return true
819                 case "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr":
820                         // Ignore the token.
821                 default:
822                         // TODO.
823                         p.addElement(p.tok.Data, p.tok.Attr)
824                 }
825         case EndTagToken:
826                 switch p.tok.Data {
827                 case "body":
828                         // TODO: autoclose the stack of open elements.
829                         p.im = afterBodyIM
830                         return true
831                 case "p":
832                         if !p.elementInScope(buttonScopeStopTags, "p") {
833                                 p.addElement("p", nil)
834                         }
835                         p.popUntil(buttonScopeStopTags, "p")
836                 case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
837                         p.inBodyEndTagFormatting(p.tok.Data)
838                 case "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul":
839                         p.popUntil(defaultScopeStopTags, p.tok.Data)
840                 case "applet", "marquee", "object":
841                         if p.popUntil(defaultScopeStopTags, p.tok.Data) {
842                                 p.clearActiveFormattingElements()
843                         }
844                 case "br":
845                         p.tok.Type = StartTagToken
846                         return false
847                 default:
848                         p.inBodyEndTagOther(p.tok.Data)
849                 }
850         case CommentToken:
851                 p.addChild(&Node{
852                         Type: CommentNode,
853                         Data: p.tok.Data,
854                 })
855         }
856
857         return true
858 }
859
860 func (p *parser) inBodyEndTagFormatting(tag string) {
861         // This is the "adoption agency" algorithm, described at
862         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
863
864         // TODO: this is a fairly literal line-by-line translation of that algorithm.
865         // Once the code successfully parses the comprehensive test suite, we should
866         // refactor this code to be more idiomatic.
867
868         // Steps 1-3. The outer loop.
869         for i := 0; i < 8; i++ {
870                 // Step 4. Find the formatting element.
871                 var formattingElement *Node
872                 for j := len(p.afe) - 1; j >= 0; j-- {
873                         if p.afe[j].Type == scopeMarkerNode {
874                                 break
875                         }
876                         if p.afe[j].Data == tag {
877                                 formattingElement = p.afe[j]
878                                 break
879                         }
880                 }
881                 if formattingElement == nil {
882                         p.inBodyEndTagOther(tag)
883                         return
884                 }
885                 feIndex := p.oe.index(formattingElement)
886                 if feIndex == -1 {
887                         p.afe.remove(formattingElement)
888                         return
889                 }
890                 if !p.elementInScope(defaultScopeStopTags, tag) {
891                         // Ignore the tag.
892                         return
893                 }
894
895                 // Steps 5-6. Find the furthest block.
896                 var furthestBlock *Node
897                 for _, e := range p.oe[feIndex:] {
898                         if isSpecialElement[e.Data] {
899                                 furthestBlock = e
900                                 break
901                         }
902                 }
903                 if furthestBlock == nil {
904                         e := p.oe.pop()
905                         for e != formattingElement {
906                                 e = p.oe.pop()
907                         }
908                         p.afe.remove(e)
909                         return
910                 }
911
912                 // Steps 7-8. Find the common ancestor and bookmark node.
913                 commonAncestor := p.oe[feIndex-1]
914                 bookmark := p.afe.index(formattingElement)
915
916                 // Step 9. The inner loop. Find the lastNode to reparent.
917                 lastNode := furthestBlock
918                 node := furthestBlock
919                 x := p.oe.index(node)
920                 // Steps 9.1-9.3.
921                 for j := 0; j < 3; j++ {
922                         // Step 9.4.
923                         x--
924                         node = p.oe[x]
925                         // Step 9.5.
926                         if p.afe.index(node) == -1 {
927                                 p.oe.remove(node)
928                                 continue
929                         }
930                         // Step 9.6.
931                         if node == formattingElement {
932                                 break
933                         }
934                         // Step 9.7.
935                         clone := node.clone()
936                         p.afe[p.afe.index(node)] = clone
937                         p.oe[p.oe.index(node)] = clone
938                         node = clone
939                         // Step 9.8.
940                         if lastNode == furthestBlock {
941                                 bookmark = p.afe.index(node) + 1
942                         }
943                         // Step 9.9.
944                         if lastNode.Parent != nil {
945                                 lastNode.Parent.Remove(lastNode)
946                         }
947                         node.Add(lastNode)
948                         // Step 9.10.
949                         lastNode = node
950                 }
951
952                 // Step 10. Reparent lastNode to the common ancestor,
953                 // or for misnested table nodes, to the foster parent.
954                 if lastNode.Parent != nil {
955                         lastNode.Parent.Remove(lastNode)
956                 }
957                 switch commonAncestor.Data {
958                 case "table", "tbody", "tfoot", "thead", "tr":
959                         p.fosterParent(lastNode)
960                 default:
961                         commonAncestor.Add(lastNode)
962                 }
963
964                 // Steps 11-13. Reparent nodes from the furthest block's children
965                 // to a clone of the formatting element.
966                 clone := formattingElement.clone()
967                 reparentChildren(clone, furthestBlock)
968                 furthestBlock.Add(clone)
969
970                 // Step 14. Fix up the list of active formatting elements.
971                 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
972                         // Move the bookmark with the rest of the list.
973                         bookmark--
974                 }
975                 p.afe.remove(formattingElement)
976                 p.afe.insert(bookmark, clone)
977
978                 // Step 15. Fix up the stack of open elements.
979                 p.oe.remove(formattingElement)
980                 p.oe.insert(p.oe.index(furthestBlock)+1, clone)
981         }
982 }
983
984 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
985 func (p *parser) inBodyEndTagOther(tag string) {
986         for i := len(p.oe) - 1; i >= 0; i-- {
987                 if p.oe[i].Data == tag {
988                         p.oe = p.oe[:i]
989                         break
990                 }
991                 if isSpecialElement[p.oe[i].Data] {
992                         break
993                 }
994         }
995 }
996
997 // Section 12.2.5.4.8.
998 func textIM(p *parser) bool {
999         switch p.tok.Type {
1000         case ErrorToken:
1001                 p.oe.pop()
1002         case TextToken:
1003                 p.addText(p.tok.Data)
1004                 return true
1005         case EndTagToken:
1006                 p.oe.pop()
1007         }
1008         p.im = p.originalIM
1009         p.originalIM = nil
1010         return p.tok.Type == EndTagToken
1011 }
1012
1013 // Section 12.2.5.4.9.
1014 func inTableIM(p *parser) bool {
1015         switch p.tok.Type {
1016         case ErrorToken:
1017                 // Stop parsing.
1018                 return true
1019         case TextToken:
1020                 // TODO.
1021         case StartTagToken:
1022                 switch p.tok.Data {
1023                 case "caption":
1024                         p.clearStackToContext(tableScopeStopTags)
1025                         p.afe = append(p.afe, &scopeMarker)
1026                         p.addElement(p.tok.Data, p.tok.Attr)
1027                         p.im = inCaptionIM
1028                         return true
1029                 case "tbody", "tfoot", "thead":
1030                         p.clearStackToContext(tableScopeStopTags)
1031                         p.addElement(p.tok.Data, p.tok.Attr)
1032                         p.im = inTableBodyIM
1033                         return true
1034                 case "td", "th", "tr":
1035                         p.clearStackToContext(tableScopeStopTags)
1036                         p.addElement("tbody", nil)
1037                         p.im = inTableBodyIM
1038                         return false
1039                 case "table":
1040                         if p.popUntil(tableScopeStopTags, "table") {
1041                                 p.resetInsertionMode()
1042                                 return false
1043                         }
1044                         // Ignore the token.
1045                         return true
1046                 case "colgroup":
1047                         p.clearStackToContext(tableScopeStopTags)
1048                         p.addElement(p.tok.Data, p.tok.Attr)
1049                         p.im = inColumnGroupIM
1050                         return true
1051                 case "col":
1052                         p.clearStackToContext(tableScopeStopTags)
1053                         p.addElement("colgroup", p.tok.Attr)
1054                         p.im = inColumnGroupIM
1055                         return false
1056                 default:
1057                         // TODO.
1058                 }
1059         case EndTagToken:
1060                 switch p.tok.Data {
1061                 case "table":
1062                         if p.popUntil(tableScopeStopTags, "table") {
1063                                 p.resetInsertionMode()
1064                                 return true
1065                         }
1066                         // Ignore the token.
1067                         return true
1068                 case "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr":
1069                         // Ignore the token.
1070                         return true
1071                 }
1072         case CommentToken:
1073                 p.addChild(&Node{
1074                         Type: CommentNode,
1075                         Data: p.tok.Data,
1076                 })
1077                 return true
1078         }
1079
1080         switch p.top().Data {
1081         case "table", "tbody", "tfoot", "thead", "tr":
1082                 p.fosterParenting = true
1083                 defer func() { p.fosterParenting = false }()
1084         }
1085
1086         return inBodyIM(p)
1087 }
1088
1089 // clearStackToContext pops elements off the stack of open elements
1090 // until an element listed in stopTags is found.
1091 func (p *parser) clearStackToContext(stopTags []string) {
1092         for i := len(p.oe) - 1; i >= 0; i-- {
1093                 for _, tag := range stopTags {
1094                         if p.oe[i].Data == tag {
1095                                 p.oe = p.oe[:i+1]
1096                                 return
1097                         }
1098                 }
1099         }
1100 }
1101
1102 // Section 12.2.5.4.11.
1103 func inCaptionIM(p *parser) bool {
1104         switch p.tok.Type {
1105         case StartTagToken:
1106                 switch p.tok.Data {
1107                 case "caption", "col", "colgroup", "tbody", "td", "tfoot", "thead", "tr":
1108                         if p.popUntil(tableScopeStopTags, "caption") {
1109                                 p.clearActiveFormattingElements()
1110                                 p.im = inTableIM
1111                                 return false
1112                         } else {
1113                                 // Ignore the token.
1114                                 return true
1115                         }
1116                 }
1117         case EndTagToken:
1118                 switch p.tok.Data {
1119                 case "caption":
1120                         if p.popUntil(tableScopeStopTags, "caption") {
1121                                 p.clearActiveFormattingElements()
1122                                 p.im = inTableIM
1123                         }
1124                         return true
1125                 case "table":
1126                         if p.popUntil(tableScopeStopTags, "caption") {
1127                                 p.clearActiveFormattingElements()
1128                                 p.im = inTableIM
1129                                 return false
1130                         } else {
1131                                 // Ignore the token.
1132                                 return true
1133                         }
1134                 case "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr":
1135                         // Ignore the token.
1136                         return true
1137                 }
1138         }
1139         return inBodyIM(p)
1140 }
1141
1142 // Section 12.2.5.4.12.
1143 func inColumnGroupIM(p *parser) bool {
1144         switch p.tok.Type {
1145         case CommentToken:
1146                 p.addChild(&Node{
1147                         Type: CommentNode,
1148                         Data: p.tok.Data,
1149                 })
1150                 return true
1151         case DoctypeToken:
1152                 // Ignore the token.
1153                 return true
1154         case StartTagToken:
1155                 switch p.tok.Data {
1156                 case "html":
1157                         return inBodyIM(p)
1158                 case "col":
1159                         p.addElement(p.tok.Data, p.tok.Attr)
1160                         p.oe.pop()
1161                         p.acknowledgeSelfClosingTag()
1162                         return true
1163                 }
1164         case EndTagToken:
1165                 switch p.tok.Data {
1166                 case "colgroup":
1167                         if p.oe.top().Data != "html" {
1168                                 p.oe.pop()
1169                                 p.im = inTableIM
1170                         }
1171                         return true
1172                 case "col":
1173                         // Ignore the token.
1174                         return true
1175                 }
1176         }
1177         if p.oe.top().Data != "html" {
1178                 p.oe.pop()
1179                 p.im = inTableIM
1180                 return false
1181         }
1182         return true
1183 }
1184
1185 // Section 12.2.5.4.13.
1186 func inTableBodyIM(p *parser) bool {
1187         var (
1188                 add      bool
1189                 data     string
1190                 attr     []Attribute
1191                 consumed bool
1192         )
1193         switch p.tok.Type {
1194         case ErrorToken:
1195                 // TODO.
1196         case TextToken:
1197                 // TODO.
1198         case StartTagToken:
1199                 switch p.tok.Data {
1200                 case "tr":
1201                         add = true
1202                         data = p.tok.Data
1203                         attr = p.tok.Attr
1204                         consumed = true
1205                 case "td", "th":
1206                         add = true
1207                         data = "tr"
1208                         consumed = false
1209                 default:
1210                         // TODO.
1211                 }
1212         case EndTagToken:
1213                 switch p.tok.Data {
1214                 case "table":
1215                         if p.popUntil(tableScopeStopTags, "tbody", "thead", "tfoot") {
1216                                 p.im = inTableIM
1217                                 return false
1218                         }
1219                         // Ignore the token.
1220                         return true
1221                 case "body", "caption", "col", "colgroup", "html", "td", "th", "tr":
1222                         // Ignore the token.
1223                         return true
1224                 }
1225         case CommentToken:
1226                 p.addChild(&Node{
1227                         Type: CommentNode,
1228                         Data: p.tok.Data,
1229                 })
1230                 return true
1231         }
1232         if add {
1233                 // TODO: clear the stack back to a table body context.
1234                 p.addElement(data, attr)
1235                 p.im = inRowIM
1236                 return consumed
1237         }
1238         return inTableIM(p)
1239 }
1240
1241 // Section 12.2.5.4.14.
1242 func inRowIM(p *parser) bool {
1243         switch p.tok.Type {
1244         case ErrorToken:
1245                 // TODO.
1246         case TextToken:
1247                 // TODO.
1248         case StartTagToken:
1249                 switch p.tok.Data {
1250                 case "td", "th":
1251                         p.clearStackToContext(tableRowContextStopTags)
1252                         p.addElement(p.tok.Data, p.tok.Attr)
1253                         p.afe = append(p.afe, &scopeMarker)
1254                         p.im = inCellIM
1255                         return true
1256                 case "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr":
1257                         if p.popUntil(tableScopeStopTags, "tr") {
1258                                 p.im = inTableBodyIM
1259                                 return false
1260                         }
1261                         // Ignore the token.
1262                         return true
1263                 default:
1264                         // TODO.
1265                 }
1266         case EndTagToken:
1267                 switch p.tok.Data {
1268                 case "tr":
1269                         if p.popUntil(tableScopeStopTags, "tr") {
1270                                 p.im = inTableBodyIM
1271                                 return true
1272                         }
1273                         // Ignore the token.
1274                         return true
1275                 case "table":
1276                         if p.popUntil(tableScopeStopTags, "tr") {
1277                                 p.im = inTableBodyIM
1278                                 return false
1279                         }
1280                         // Ignore the token.
1281                         return true
1282                 case "tbody", "tfoot", "thead":
1283                         // TODO.
1284                 case "body", "caption", "col", "colgroup", "html", "td", "th":
1285                         // Ignore the token.
1286                         return true
1287                 default:
1288                         // TODO.
1289                 }
1290         case CommentToken:
1291                 p.addChild(&Node{
1292                         Type: CommentNode,
1293                         Data: p.tok.Data,
1294                 })
1295                 return true
1296         }
1297         return inTableIM(p)
1298 }
1299
1300 // Section 12.2.5.4.15.
1301 func inCellIM(p *parser) bool {
1302         var (
1303                 closeTheCellAndReprocess bool
1304         )
1305         switch p.tok.Type {
1306         case StartTagToken:
1307                 switch p.tok.Data {
1308                 case "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr":
1309                         // TODO: check for "td" or "th" in table scope.
1310                         closeTheCellAndReprocess = true
1311                 }
1312         case EndTagToken:
1313                 switch p.tok.Data {
1314                 case "td", "th":
1315                         if !p.popUntil(tableScopeStopTags, p.tok.Data) {
1316                                 // Ignore the token.
1317                                 return true
1318                         }
1319                         p.clearActiveFormattingElements()
1320                         p.im = inRowIM
1321                         return true
1322                 case "body", "caption", "col", "colgroup", "html":
1323                         // TODO.
1324                 case "table", "tbody", "tfoot", "thead", "tr":
1325                         // TODO: check for matching element in table scope.
1326                         closeTheCellAndReprocess = true
1327                 }
1328         case CommentToken:
1329                 p.addChild(&Node{
1330                         Type: CommentNode,
1331                         Data: p.tok.Data,
1332                 })
1333                 return true
1334         }
1335         if closeTheCellAndReprocess {
1336                 if p.popUntil(tableScopeStopTags, "td") || p.popUntil(tableScopeStopTags, "th") {
1337                         p.clearActiveFormattingElements()
1338                         p.im = inRowIM
1339                         return false
1340                 }
1341         }
1342         return inBodyIM(p)
1343 }
1344
1345 // Section 12.2.5.4.16.
1346 func inSelectIM(p *parser) bool {
1347         endSelect := false
1348         switch p.tok.Type {
1349         case ErrorToken:
1350                 // TODO.
1351         case TextToken:
1352                 p.addText(p.tok.Data)
1353         case StartTagToken:
1354                 switch p.tok.Data {
1355                 case "html":
1356                         // TODO.
1357                 case "option":
1358                         if p.top().Data == "option" {
1359                                 p.oe.pop()
1360                         }
1361                         p.addElement(p.tok.Data, p.tok.Attr)
1362                 case "optgroup":
1363                         if p.top().Data == "option" {
1364                                 p.oe.pop()
1365                         }
1366                         if p.top().Data == "optgroup" {
1367                                 p.oe.pop()
1368                         }
1369                         p.addElement(p.tok.Data, p.tok.Attr)
1370                 case "select":
1371                         endSelect = true
1372                 case "input", "keygen", "textarea":
1373                         // TODO.
1374                 case "script":
1375                         // TODO.
1376                 default:
1377                         // Ignore the token.
1378                 }
1379         case EndTagToken:
1380                 switch p.tok.Data {
1381                 case "option":
1382                         if p.top().Data == "option" {
1383                                 p.oe.pop()
1384                         }
1385                 case "optgroup":
1386                         i := len(p.oe) - 1
1387                         if p.oe[i].Data == "option" {
1388                                 i--
1389                         }
1390                         if p.oe[i].Data == "optgroup" {
1391                                 p.oe = p.oe[:i]
1392                         }
1393                 case "select":
1394                         endSelect = true
1395                 default:
1396                         // Ignore the token.
1397                 }
1398         case CommentToken:
1399                 p.doc.Add(&Node{
1400                         Type: CommentNode,
1401                         Data: p.tok.Data,
1402                 })
1403         }
1404         if endSelect {
1405                 for i := len(p.oe) - 1; i >= 0; i-- {
1406                         switch p.oe[i].Data {
1407                         case "select":
1408                                 p.oe = p.oe[:i]
1409                                 p.resetInsertionMode()
1410                                 return true
1411                         case "option", "optgroup":
1412                                 continue
1413                         default:
1414                                 // Ignore the token.
1415                                 return true
1416                         }
1417                 }
1418         }
1419         return true
1420 }
1421
1422 // Section 12.2.5.4.18.
1423 func afterBodyIM(p *parser) bool {
1424         switch p.tok.Type {
1425         case ErrorToken:
1426                 // Stop parsing.
1427                 return true
1428         case StartTagToken:
1429                 if p.tok.Data == "html" {
1430                         return inBodyIM(p)
1431                 }
1432         case EndTagToken:
1433                 if p.tok.Data == "html" {
1434                         p.im = afterAfterBodyIM
1435                         return true
1436                 }
1437         case CommentToken:
1438                 // The comment is attached to the <html> element.
1439                 if len(p.oe) < 1 || p.oe[0].Data != "html" {
1440                         panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
1441                 }
1442                 p.oe[0].Add(&Node{
1443                         Type: CommentNode,
1444                         Data: p.tok.Data,
1445                 })
1446                 return true
1447         }
1448         p.im = inBodyIM
1449         return false
1450 }
1451
1452 // Section 12.2.5.4.19.
1453 func inFramesetIM(p *parser) bool {
1454         switch p.tok.Type {
1455         case CommentToken:
1456                 p.addChild(&Node{
1457                         Type: CommentNode,
1458                         Data: p.tok.Data,
1459                 })
1460         case TextToken:
1461                 // Ignore all text but whitespace.
1462                 s := strings.Map(func(c rune) rune {
1463                         switch c {
1464                         case ' ', '\t', '\n', '\f', '\r':
1465                                 return c
1466                         }
1467                         return -1
1468                 }, p.tok.Data)
1469                 if s != "" {
1470                         p.addText(s)
1471                 }
1472         case StartTagToken:
1473                 switch p.tok.Data {
1474                 case "html":
1475                         return inBodyIM(p)
1476                 case "frameset":
1477                         p.addElement(p.tok.Data, p.tok.Attr)
1478                 case "frame":
1479                         p.addElement(p.tok.Data, p.tok.Attr)
1480                         p.oe.pop()
1481                         p.acknowledgeSelfClosingTag()
1482                 case "noframes":
1483                         return inHeadIM(p)
1484                 }
1485         case EndTagToken:
1486                 switch p.tok.Data {
1487                 case "frameset":
1488                         if p.oe.top().Data != "html" {
1489                                 p.oe.pop()
1490                                 if p.oe.top().Data != "frameset" {
1491                                         p.im = afterFramesetIM
1492                                         return true
1493                                 }
1494                         }
1495                 }
1496         default:
1497                 // Ignore the token.
1498         }
1499         return true
1500 }
1501
1502 // Section 12.2.5.4.20.
1503 func afterFramesetIM(p *parser) bool {
1504         switch p.tok.Type {
1505         case CommentToken:
1506                 p.addChild(&Node{
1507                         Type: CommentNode,
1508                         Data: p.tok.Data,
1509                 })
1510         case TextToken:
1511                 // Ignore all text but whitespace.
1512                 s := strings.Map(func(c rune) rune {
1513                         switch c {
1514                         case ' ', '\t', '\n', '\f', '\r':
1515                                 return c
1516                         }
1517                         return -1
1518                 }, p.tok.Data)
1519                 if s != "" {
1520                         p.addText(s)
1521                 }
1522         case StartTagToken:
1523                 switch p.tok.Data {
1524                 case "html":
1525                         return inBodyIM(p)
1526                 case "noframes":
1527                         return inHeadIM(p)
1528                 }
1529         case EndTagToken:
1530                 switch p.tok.Data {
1531                 case "html":
1532                         p.im = afterAfterFramesetIM
1533                         return true
1534                 }
1535         default:
1536                 // Ignore the token.
1537         }
1538         return true
1539 }
1540
1541 // Section 12.2.5.4.21.
1542 func afterAfterBodyIM(p *parser) bool {
1543         switch p.tok.Type {
1544         case ErrorToken:
1545                 // Stop parsing.
1546                 return true
1547         case TextToken:
1548                 // TODO.
1549         case StartTagToken:
1550                 if p.tok.Data == "html" {
1551                         return inBodyIM(p)
1552                 }
1553         case CommentToken:
1554                 p.doc.Add(&Node{
1555                         Type: CommentNode,
1556                         Data: p.tok.Data,
1557                 })
1558                 return true
1559         }
1560         p.im = inBodyIM
1561         return false
1562 }
1563
1564 // Section 12.2.5.4.22.
1565 func afterAfterFramesetIM(p *parser) bool {
1566         switch p.tok.Type {
1567         case CommentToken:
1568                 p.addChild(&Node{
1569                         Type: CommentNode,
1570                         Data: p.tok.Data,
1571                 })
1572         case StartTagToken:
1573                 switch p.tok.Data {
1574                 case "html":
1575                         return inBodyIM(p)
1576                 case "noframes":
1577                         return inHeadIM(p)
1578                 }
1579         default:
1580                 // Ignore the token.
1581         }
1582         return true
1583 }
1584
1585 // Section 12.2.5.5.
1586 func inForeignContentIM(p *parser) bool {
1587         switch p.tok.Type {
1588         case CommentToken:
1589                 p.addChild(&Node{
1590                         Type: CommentNode,
1591                         Data: p.tok.Data,
1592                 })
1593         case StartTagToken:
1594                 if breakout[p.tok.Data] {
1595                         // TODO.
1596                 }
1597                 switch p.top().Namespace {
1598                 case "mathml":
1599                         // TODO: adjust MathML attributes.
1600                 case "svg":
1601                         // TODO: adjust SVG tag names.
1602                         // TODO: adjust SVG attributes.
1603                 default:
1604                         panic("html: bad parser state: unexpected namespace")
1605                 }
1606                 // TODO: adjust foreign attributes.
1607                 p.addElement(p.tok.Data, p.tok.Attr)
1608         case EndTagToken:
1609                 // TODO.
1610         default:
1611                 // Ignore the token.
1612         }
1613         return true
1614 }
1615
1616 func (p *parser) parse() error {
1617         // Iterate until EOF. Any other error will cause an early return.
1618         consumed := true
1619         for {
1620                 if consumed {
1621                         if err := p.read(); err != nil {
1622                                 if err == io.EOF {
1623                                         break
1624                                 }
1625                                 return err
1626                         }
1627                 }
1628                 consumed = p.im(p)
1629         }
1630         // Loop until the final token (the ErrorToken signifying EOF) is consumed.
1631         for {
1632                 if consumed = p.im(p); consumed {
1633                         break
1634                 }
1635         }
1636         return nil
1637 }
1638
1639 // Parse returns the parse tree for the HTML from the given Reader.
1640 // The input is assumed to be UTF-8 encoded.
1641 func Parse(r io.Reader) (*Node, error) {
1642         p := &parser{
1643                 tokenizer: NewTokenizer(r),
1644                 doc: &Node{
1645                         Type: DocumentNode,
1646                 },
1647                 scripting:  true,
1648                 framesetOK: true,
1649                 im:         initialIM,
1650         }
1651         err := p.parse()
1652         if err != nil {
1653                 return nil, err
1654         }
1655         return p.doc, nil
1656 }
1657
1658 // ParseFragment parses a fragment of HTML and returns the nodes that were 
1659 // found. If the fragment is the InnerHTML for an existing element, pass that
1660 // element in context.
1661 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
1662         p := &parser{
1663                 tokenizer: NewTokenizer(r),
1664                 doc: &Node{
1665                         Type: DocumentNode,
1666                 },
1667                 scripting: true,
1668                 context:   context,
1669         }
1670
1671         if context != nil {
1672                 switch context.Data {
1673                 case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
1674                         p.tokenizer.rawTag = context.Data
1675                 }
1676         }
1677
1678         root := &Node{
1679                 Type: ElementNode,
1680                 Data: "html",
1681         }
1682         p.doc.Add(root)
1683         p.oe = nodeStack{root}
1684         p.resetInsertionMode()
1685
1686         for n := context; n != nil; n = n.Parent {
1687                 if n.Type == ElementNode && n.Data == "form" {
1688                         p.form = n
1689                         break
1690                 }
1691         }
1692
1693         err := p.parse()
1694         if err != nil {
1695                 return nil, err
1696         }
1697
1698         parent := p.doc
1699         if context != nil {
1700                 parent = root
1701         }
1702
1703         result := parent.Child
1704         parent.Child = nil
1705         for _, n := range result {
1706                 n.Parent = nil
1707         }
1708         return result, nil
1709 }