libgo/go/html/token.go

   1 // Copyright 2010 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package html
   6
   7 import (
   8         "bytes"
   9         "io"
  10         "strconv"
  11         "strings"
  12 )
  13
  14 // A TokenType is the type of a Token.
  15 type TokenType int
  16
  17 const (
  18         // ErrorToken means that an error occurred during tokenization.
  19         ErrorToken TokenType = iota
  20         // TextToken means a text node.
  21         TextToken
  22         // A StartTagToken looks like <a>.
  23         StartTagToken
  24         // An EndTagToken looks like </a>.
  25         EndTagToken
  26         // A SelfClosingTagToken tag looks like <br/>.
  27         SelfClosingTagToken
  28         // A CommentToken looks like <!--x-->.
  29         CommentToken
  30         // A DoctypeToken looks like <!DOCTYPE x>
  31         DoctypeToken
  32 )
  33
  34 // String returns a string representation of the TokenType.
  35 func (t TokenType) String() string {
  36         switch t {
  37         case ErrorToken:
  38                 return "Error"
  39         case TextToken:
  40                 return "Text"
  41         case StartTagToken:
  42                 return "StartTag"
  43         case EndTagToken:
  44                 return "EndTag"
  45         case SelfClosingTagToken:
  46                 return "SelfClosingTag"
  47         case CommentToken:
  48                 return "Comment"
  49         case DoctypeToken:
  50                 return "Doctype"
  51         }
  52         return "Invalid(" + strconv.Itoa(int(t)) + ")"
  53 }
  54
  55 // An Attribute is an attribute key-value pair. Key is alphabetic (and hence
  56 // does not contain escapable characters like '&', '<' or '>'), and Val is
  57 // unescaped (it looks like "a<b" rather than "a&lt;b").
  58 type Attribute struct {
  59         Key, Val string
  60 }
  61
  62 // A Token consists of a TokenType and some Data (tag name for start and end
  63 // tags, content for text, comments and doctypes). A tag Token may also contain
  64 // a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
  65 // rather than "a&lt;b").
  66 type Token struct {
  67         Type TokenType
  68         Data string
  69         Attr []Attribute
  70 }
  71
  72 // tagString returns a string representation of a tag Token's Data and Attr.
  73 func (t Token) tagString() string {
  74         if len(t.Attr) == 0 {
  75                 return t.Data
  76         }
  77         buf := bytes.NewBuffer(nil)
  78         buf.WriteString(t.Data)
  79         for _, a := range t.Attr {
  80                 buf.WriteByte(' ')
  81                 buf.WriteString(a.Key)
  82                 buf.WriteString(`="`)
  83                 escape(buf, a.Val)
  84                 buf.WriteByte('"')
  85         }
  86         return buf.String()
  87 }
  88
  89 // String returns a string representation of the Token.
  90 func (t Token) String() string {
  91         switch t.Type {
  92         case ErrorToken:
  93                 return ""
  94         case TextToken:
  95                 return EscapeString(t.Data)
  96         case StartTagToken:
  97                 return "<" + t.tagString() + ">"
  98         case EndTagToken:
  99                 return "</" + t.tagString() + ">"
 100         case SelfClosingTagToken:
 101                 return "<" + t.tagString() + "/>"
 102         case CommentToken:
 103                 return "<!--" + t.Data + "-->"
 104         case DoctypeToken:
 105                 return "<!DOCTYPE " + t.Data + ">"
 106         }
 107         return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
 108 }
 109
 110 // span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
 111 // the end is exclusive.
 112 type span struct {
 113         start, end int
 114 }
 115
 116 // A Tokenizer returns a stream of HTML Tokens.
 117 type Tokenizer struct {
 118         // r is the source of the HTML text.
 119         r io.Reader
 120         // tt is the TokenType of the current token.
 121         tt TokenType
 122         // err is the first error encountered during tokenization. It is possible
 123         // for tt != Error && err != nil to hold: this means that Next returned a
 124         // valid token but the subsequent Next call will return an error token.
 125         // For example, if the HTML text input was just "plain", then the first
 126         // Next call would set z.err to os.EOF but return a TextToken, and all
 127         // subsequent Next calls would return an ErrorToken.
 128         // err is never reset. Once it becomes non-nil, it stays non-nil.
 129         err error
 130         // buf[raw.start:raw.end] holds the raw bytes of the current token.
 131         // buf[raw.end:] is buffered input that will yield future tokens.
 132         raw span
 133         buf []byte
 134         // buf[data.start:data.end] holds the raw bytes of the current token's data:
 135         // a text token's text, a tag token's tag name, etc.
 136         data span
 137         // pendingAttr is the attribute key and value currently being tokenized.
 138         // When complete, pendingAttr is pushed onto attr. nAttrReturned is
 139         // incremented on each call to TagAttr.
 140         pendingAttr   [2]span
 141         attr          [][2]span
 142         nAttrReturned int
 143         // rawTag is the "script" in "</script>" that closes the next token. If
 144         // non-empty, the subsequent call to Next will return a raw or RCDATA text
 145         // token: one that treats "<p>" as text instead of an element.
 146         // rawTag's contents are lower-cased.
 147         rawTag string
 148         // textIsRaw is whether the current text token's data is not escaped.
 149         textIsRaw bool
 150 }
 151
 152 // Error returns the error associated with the most recent ErrorToken token.
 153 // This is typically os.EOF, meaning the end of tokenization.
 154 func (z *Tokenizer) Error() error {
 155         if z.tt != ErrorToken {
 156                 return nil
 157         }
 158         return z.err
 159 }
 160
 161 // readByte returns the next byte from the input stream, doing a buffered read
 162 // from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte
 163 // slice that holds all the bytes read so far for the current token.
 164 // It sets z.err if the underlying reader returns an error.
 165 // Pre-condition: z.err == nil.
 166 func (z *Tokenizer) readByte() byte {
 167         if z.raw.end >= len(z.buf) {
 168                 // Our buffer is exhausted and we have to read from z.r.
 169                 // We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
 170                 // z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
 171                 // allocate a new buffer before the copy.
 172                 c := cap(z.buf)
 173                 d := z.raw.end - z.raw.start
 174                 var buf1 []byte
 175                 if 2*d > c {
 176                         buf1 = make([]byte, d, 2*c)
 177                 } else {
 178                         buf1 = z.buf[:d]
 179                 }
 180                 copy(buf1, z.buf[z.raw.start:z.raw.end])
 181                 if x := z.raw.start; x != 0 {
 182                         // Adjust the data/attr spans to refer to the same contents after the copy.
 183                         z.data.start -= x
 184                         z.data.end -= x
 185                         z.pendingAttr[0].start -= x
 186                         z.pendingAttr[0].end -= x
 187                         z.pendingAttr[1].start -= x
 188                         z.pendingAttr[1].end -= x
 189                         for i := range z.attr {
 190                                 z.attr[i][0].start -= x
 191                                 z.attr[i][0].end -= x
 192                                 z.attr[i][1].start -= x
 193                                 z.attr[i][1].end -= x
 194                         }
 195                 }
 196                 z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
 197                 // Now that we have copied the live bytes to the start of the buffer,
 198                 // we read from z.r into the remainder.
 199                 n, err := z.r.Read(buf1[d:cap(buf1)])
 200                 if err != nil {
 201                         z.err = err
 202                         return 0
 203                 }
 204                 z.buf = buf1[:d+n]
 205         }
 206         x := z.buf[z.raw.end]
 207         z.raw.end++
 208         return x
 209 }
 210
 211 // skipWhiteSpace skips past any white space.
 212 func (z *Tokenizer) skipWhiteSpace() {
 213         if z.err != nil {
 214                 return
 215         }
 216         for {
 217                 c := z.readByte()
 218                 if z.err != nil {
 219                         return
 220                 }
 221                 switch c {
 222                 case ' ', '\n', '\r', '\t', '\f':
 223                         // No-op.
 224                 default:
 225                         z.raw.end--
 226                         return
 227                 }
 228         }
 229 }
 230
 231 // readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
 232 // is typically something like "script" or "textarea".
 233 func (z *Tokenizer) readRawOrRCDATA() {
 234 loop:
 235         for {
 236                 c := z.readByte()
 237                 if z.err != nil {
 238                         break loop
 239                 }
 240                 if c != '<' {
 241                         continue loop
 242                 }
 243                 c = z.readByte()
 244                 if z.err != nil {
 245                         break loop
 246                 }
 247                 if c != '/' {
 248                         continue loop
 249                 }
 250                 for i := 0; i < len(z.rawTag); i++ {
 251                         c = z.readByte()
 252                         if z.err != nil {
 253                                 break loop
 254                         }
 255                         if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
 256                                 continue loop
 257                         }
 258                 }
 259                 c = z.readByte()
 260                 if z.err != nil {
 261                         break loop
 262                 }
 263                 switch c {
 264                 case ' ', '\n', '\r', '\t', '\f', '/', '>':
 265                         // The 3 is 2 for the leading "</" plus 1 for the trailing character c.
 266                         z.raw.end -= 3 + len(z.rawTag)
 267                         break loop
 268                 case '<':
 269                         // Step back one, to catch "</foo</foo>".
 270                         z.raw.end--
 271                 }
 272         }
 273         z.data.end = z.raw.end
 274         // A textarea's or title's RCDATA can contain escaped entities.
 275         z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
 276         z.rawTag = ""
 277 }
 278
 279 // readComment reads the next comment token starting with "<!--". The opening
 280 // "<!--" has already been consumed.
 281 func (z *Tokenizer) readComment() {
 282         z.data.start = z.raw.end
 283         defer func() {
 284                 if z.data.end < z.data.start {
 285                         // It's a comment with no data, like <!-->.
 286                         z.data.end = z.data.start
 287                 }
 288         }()
 289         for dashCount := 2; ; {
 290                 c := z.readByte()
 291                 if z.err != nil {
 292                         z.data.end = z.raw.end
 293                         return
 294                 }
 295                 switch c {
 296                 case '-':
 297                         dashCount++
 298                         continue
 299                 case '>':
 300                         if dashCount >= 2 {
 301                                 z.data.end = z.raw.end - len("-->")
 302                                 return
 303                         }
 304                 case '!':
 305                         if dashCount >= 2 {
 306                                 c = z.readByte()
 307                                 if z.err != nil {
 308                                         z.data.end = z.raw.end
 309                                         return
 310                                 }
 311                                 if c == '>' {
 312                                         z.data.end = z.raw.end - len("--!>")
 313                                         return
 314                                 }
 315                         }
 316                 }
 317                 dashCount = 0
 318         }
 319 }
 320
 321 // readUntilCloseAngle reads until the next ">".
 322 func (z *Tokenizer) readUntilCloseAngle() {
 323         z.data.start = z.raw.end
 324         for {
 325                 c := z.readByte()
 326                 if z.err != nil {
 327                         z.data.end = z.raw.end
 328                         return
 329                 }
 330                 if c == '>' {
 331                         z.data.end = z.raw.end - len(">")
 332                         return
 333                 }
 334         }
 335 }
 336
 337 // readMarkupDeclaration reads the next token starting with "<!". It might be
 338 // a "<!--comment-->", a "<!DOCTYPE foo>", or "<!a bogus comment". The opening
 339 // "<!" has already been consumed.
 340 func (z *Tokenizer) readMarkupDeclaration() TokenType {
 341         z.data.start = z.raw.end
 342         var c [2]byte
 343         for i := 0; i < 2; i++ {
 344                 c[i] = z.readByte()
 345                 if z.err != nil {
 346                         z.data.end = z.raw.end
 347                         return CommentToken
 348                 }
 349         }
 350         if c[0] == '-' && c[1] == '-' {
 351                 z.readComment()
 352                 return CommentToken
 353         }
 354         z.raw.end -= 2
 355         const s = "DOCTYPE"
 356         for i := 0; i < len(s); i++ {
 357                 c := z.readByte()
 358                 if z.err != nil {
 359                         z.data.end = z.raw.end
 360                         return CommentToken
 361                 }
 362                 if c != s[i] && c != s[i]+('a'-'A') {
 363                         // Back up to read the fragment of "DOCTYPE" again.
 364                         z.raw.end = z.data.start
 365                         z.readUntilCloseAngle()
 366                         return CommentToken
 367                 }
 368         }
 369         if z.skipWhiteSpace(); z.err != nil {
 370                 z.data.start = z.raw.end
 371                 z.data.end = z.raw.end
 372                 return DoctypeToken
 373         }
 374         z.readUntilCloseAngle()
 375         return DoctypeToken
 376 }
 377
 378 // readStartTag reads the next start tag token. The opening "<a" has already
 379 // been consumed, where 'a' means anything in [A-Za-z].
 380 func (z *Tokenizer) readStartTag() TokenType {
 381         z.attr = z.attr[:0]
 382         z.nAttrReturned = 0
 383         // Read the tag name and attribute key/value pairs.
 384         z.readTagName()
 385         if z.skipWhiteSpace(); z.err != nil {
 386                 return ErrorToken
 387         }
 388         for {
 389                 c := z.readByte()
 390                 if z.err != nil || c == '>' {
 391                         break
 392                 }
 393                 z.raw.end--
 394                 z.readTagAttrKey()
 395                 z.readTagAttrVal()
 396                 // Save pendingAttr if it has a non-empty key.
 397                 if z.pendingAttr[0].start != z.pendingAttr[0].end {
 398                         z.attr = append(z.attr, z.pendingAttr)
 399                 }
 400                 if z.skipWhiteSpace(); z.err != nil {
 401                         break
 402                 }
 403         }
 404         // Any "<noembed>", "<noframes>", "<noscript>", "<script>", "<style>",
 405         // "<textarea>" or "<title>" tag flags the tokenizer's next token as raw.
 406         // The tag name lengths of these special cases ranges in [5, 8].
 407         if x := z.data.end - z.data.start; 5 <= x && x <= 8 {
 408                 switch z.buf[z.data.start] {
 409                 case 'n', 's', 't', 'N', 'S', 'T':
 410                         switch s := strings.ToLower(string(z.buf[z.data.start:z.data.end])); s {
 411                         case "noembed", "noframes", "noscript", "script", "style", "textarea", "title":
 412                                 z.rawTag = s
 413                         }
 414                 }
 415         }
 416         // Look for a self-closing token like "<br/>".
 417         if z.err == nil && z.buf[z.raw.end-2] == '/' {
 418                 return SelfClosingTagToken
 419         }
 420         return StartTagToken
 421 }
 422
 423 // readEndTag reads the next end tag token. The opening "</a" has already
 424 // been consumed, where 'a' means anything in [A-Za-z].
 425 func (z *Tokenizer) readEndTag() {
 426         z.attr = z.attr[:0]
 427         z.nAttrReturned = 0
 428         z.readTagName()
 429         for {
 430                 c := z.readByte()
 431                 if z.err != nil || c == '>' {
 432                         return
 433                 }
 434         }
 435 }
 436
 437 // readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
 438 // is positioned such that the first byte of the tag name (the "d" in "<div")
 439 // has already been consumed.
 440 func (z *Tokenizer) readTagName() {
 441         z.data.start = z.raw.end - 1
 442         for {
 443                 c := z.readByte()
 444                 if z.err != nil {
 445                         z.data.end = z.raw.end
 446                         return
 447                 }
 448                 switch c {
 449                 case ' ', '\n', '\r', '\t', '\f':
 450                         z.data.end = z.raw.end - 1
 451                         return
 452                 case '/', '>':
 453                         z.raw.end--
 454                         z.data.end = z.raw.end
 455                         return
 456                 }
 457         }
 458 }
 459
 460 // readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
 461 // Precondition: z.err == nil.
 462 func (z *Tokenizer) readTagAttrKey() {
 463         z.pendingAttr[0].start = z.raw.end
 464         for {
 465                 c := z.readByte()
 466                 if z.err != nil {
 467                         z.pendingAttr[0].end = z.raw.end
 468                         return
 469                 }
 470                 switch c {
 471                 case ' ', '\n', '\r', '\t', '\f', '/':
 472                         z.pendingAttr[0].end = z.raw.end - 1
 473                         return
 474                 case '=', '>':
 475                         z.raw.end--
 476                         z.pendingAttr[0].end = z.raw.end
 477                         return
 478                 }
 479         }
 480 }
 481
 482 // readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
 483 func (z *Tokenizer) readTagAttrVal() {
 484         z.pendingAttr[1].start = z.raw.end
 485         z.pendingAttr[1].end = z.raw.end
 486         if z.skipWhiteSpace(); z.err != nil {
 487                 return
 488         }
 489         c := z.readByte()
 490         if z.err != nil {
 491                 return
 492         }
 493         if c != '=' {
 494                 z.raw.end--
 495                 return
 496         }
 497         if z.skipWhiteSpace(); z.err != nil {
 498                 return
 499         }
 500         quote := z.readByte()
 501         if z.err != nil {
 502                 return
 503         }
 504         switch quote {
 505         case '>':
 506                 z.raw.end--
 507                 return
 508
 509         case '\'', '"':
 510                 z.pendingAttr[1].start = z.raw.end
 511                 for {
 512                         c := z.readByte()
 513                         if z.err != nil {
 514                                 z.pendingAttr[1].end = z.raw.end
 515                                 return
 516                         }
 517                         if c == quote {
 518                                 z.pendingAttr[1].end = z.raw.end - 1
 519                                 return
 520                         }
 521                 }
 522
 523         default:
 524                 z.pendingAttr[1].start = z.raw.end - 1
 525                 for {
 526                         c := z.readByte()
 527                         if z.err != nil {
 528                                 z.pendingAttr[1].end = z.raw.end
 529                                 return
 530                         }
 531                         switch c {
 532                         case ' ', '\n', '\r', '\t', '\f':
 533                                 z.pendingAttr[1].end = z.raw.end - 1
 534                                 return
 535                         case '>':
 536                                 z.raw.end--
 537                                 z.pendingAttr[1].end = z.raw.end
 538                                 return
 539                         }
 540                 }
 541         }
 542 }
 543
 544 // Next scans the next token and returns its type.
 545 func (z *Tokenizer) Next() TokenType {
 546         if z.err != nil {
 547                 z.tt = ErrorToken
 548                 return z.tt
 549         }
 550         z.raw.start = z.raw.end
 551         z.data.start = z.raw.end
 552         z.data.end = z.raw.end
 553         if z.rawTag != "" {
 554                 z.readRawOrRCDATA()
 555                 z.tt = TextToken
 556                 return z.tt
 557         }
 558         z.textIsRaw = false
 559
 560 loop:
 561         for {
 562                 c := z.readByte()
 563                 if z.err != nil {
 564                         break loop
 565                 }
 566                 if c != '<' {
 567                         continue loop
 568                 }
 569
 570                 // Check if the '<' we have just read is part of a tag, comment
 571                 // or doctype. If not, it's part of the accumulated text token.
 572                 c = z.readByte()
 573                 if z.err != nil {
 574                         break loop
 575                 }
 576                 var tokenType TokenType
 577                 switch {
 578                 case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
 579                         tokenType = StartTagToken
 580                 case c == '/':
 581                         tokenType = EndTagToken
 582                 case c == '!' || c == '?':
 583                         // We use CommentToken to mean any of "<!--actual comments-->",
 584                         // "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
 585                         tokenType = CommentToken
 586                 default:
 587                         continue
 588                 }
 589
 590                 // We have a non-text token, but we might have accumulated some text
 591                 // before that. If so, we return the text first, and return the non-
 592                 // text token on the subsequent call to Next.
 593                 if x := z.raw.end - len("<a"); z.raw.start < x {
 594                         z.raw.end = x
 595                         z.data.end = x
 596                         z.tt = TextToken
 597                         return z.tt
 598                 }
 599                 switch tokenType {
 600                 case StartTagToken:
 601                         z.tt = z.readStartTag()
 602                         return z.tt
 603                 case EndTagToken:
 604                         c = z.readByte()
 605                         if z.err != nil {
 606                                 break loop
 607                         }
 608                         if c == '>' {
 609                                 // "</>" does not generate a token at all.
 610                                 // Reset the tokenizer state and start again.
 611                                 z.raw.start = z.raw.end
 612                                 z.data.start = z.raw.end
 613                                 z.data.end = z.raw.end
 614                                 continue loop
 615                         }
 616                         if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
 617                                 z.readEndTag()
 618                                 z.tt = EndTagToken
 619                                 return z.tt
 620                         }
 621                         z.raw.end--
 622                         z.readUntilCloseAngle()
 623                         z.tt = CommentToken
 624                         return z.tt
 625                 case CommentToken:
 626                         if c == '!' {
 627                                 z.tt = z.readMarkupDeclaration()
 628                                 return z.tt
 629                         }
 630                         z.raw.end--
 631                         z.readUntilCloseAngle()
 632                         z.tt = CommentToken
 633                         return z.tt
 634                 }
 635         }
 636         if z.raw.start < z.raw.end {
 637                 z.data.end = z.raw.end
 638                 z.tt = TextToken
 639                 return z.tt
 640         }
 641         z.tt = ErrorToken
 642         return z.tt
 643 }
 644
 645 // Raw returns the unmodified text of the current token. Calling Next, Token,
 646 // Text, TagName or TagAttr may change the contents of the returned slice.
 647 func (z *Tokenizer) Raw() []byte {
 648         return z.buf[z.raw.start:z.raw.end]
 649 }
 650
 651 // Text returns the unescaped text of a text, comment or doctype token. The
 652 // contents of the returned slice may change on the next call to Next.
 653 func (z *Tokenizer) Text() []byte {
 654         switch z.tt {
 655         case TextToken, CommentToken, DoctypeToken:
 656                 s := z.buf[z.data.start:z.data.end]
 657                 z.data.start = z.raw.end
 658                 z.data.end = z.raw.end
 659                 if !z.textIsRaw {
 660                         s = unescape(s)
 661                 }
 662                 return s
 663         }
 664         return nil
 665 }
 666
 667 // TagName returns the lower-cased name of a tag token (the `img` out of
 668 // `<IMG SRC="foo">`) and whether the tag has attributes.
 669 // The contents of the returned slice may change on the next call to Next.
 670 func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
 671         if z.data.start < z.data.end {
 672                 switch z.tt {
 673                 case StartTagToken, EndTagToken, SelfClosingTagToken:
 674                         s := z.buf[z.data.start:z.data.end]
 675                         z.data.start = z.raw.end
 676                         z.data.end = z.raw.end
 677                         return lower(s), z.nAttrReturned < len(z.attr)
 678                 }
 679         }
 680         return nil, false
 681 }
 682
 683 // TagAttr returns the lower-cased key and unescaped value of the next unparsed
 684 // attribute for the current tag token and whether there are more attributes.
 685 // The contents of the returned slices may change on the next call to Next.
 686 func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
 687         if z.nAttrReturned < len(z.attr) {
 688                 switch z.tt {
 689                 case StartTagToken, SelfClosingTagToken:
 690                         x := z.attr[z.nAttrReturned]
 691                         z.nAttrReturned++
 692                         key = z.buf[x[0].start:x[0].end]
 693                         val = z.buf[x[1].start:x[1].end]
 694                         return lower(key), unescape(val), z.nAttrReturned < len(z.attr)
 695                 }
 696         }
 697         return nil, nil, false
 698 }
 699
 700 // Token returns the next Token. The result's Data and Attr values remain valid
 701 // after subsequent Next calls.
 702 func (z *Tokenizer) Token() Token {
 703         t := Token{Type: z.tt}
 704         switch z.tt {
 705         case TextToken, CommentToken, DoctypeToken:
 706                 t.Data = string(z.Text())
 707         case StartTagToken, SelfClosingTagToken:
 708                 var attr []Attribute
 709                 name, moreAttr := z.TagName()
 710                 for moreAttr {
 711                         var key, val []byte
 712                         key, val, moreAttr = z.TagAttr()
 713                         attr = append(attr, Attribute{string(key), string(val)})
 714                 }
 715                 t.Data = string(name)
 716                 t.Attr = attr
 717         case EndTagToken:
 718                 name, _ := z.TagName()
 719                 t.Data = string(name)
 720         }
 721         return t
 722 }
 723
 724 // NewTokenizer returns a new HTML Tokenizer for the given Reader.
 725 // The input is assumed to be UTF-8 encoded.
 726 func NewTokenizer(r io.Reader) *Tokenizer {
 727         return &Tokenizer{
 728                 r:   r,
 729                 buf: make([]byte, 0, 4096),
 730         }
 731 }