libgo/go/html/token_test.go

   1 // Copyright 2010 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package html
   6
   7 import (
   8         "bytes"
   9         "os"
  10         "strings"
  11         "testing"
  12 )
  13
  14 type tokenTest struct {
  15         // A short description of the test case.
  16         desc string
  17         // The HTML to parse.
  18         html string
  19         // The string representations of the expected tokens, joined by '$'.
  20         golden string
  21 }
  22
  23 var tokenTests = []tokenTest{
  24         {
  25                 "empty",
  26                 "",
  27                 "",
  28         },
  29         // A single text node. The tokenizer should not break text nodes on whitespace,
  30         // nor should it normalize whitespace within a text node.
  31         {
  32                 "text",
  33                 "foo  bar",
  34                 "foo  bar",
  35         },
  36         // An entity.
  37         {
  38                 "entity",
  39                 "one &lt; two",
  40                 "one &lt; two",
  41         },
  42         // A start, self-closing and end tag. The tokenizer does not care if the start
  43         // and end tokens don't match; that is the job of the parser.
  44         {
  45                 "tags",
  46                 "<a>b<c/>d</e>",
  47                 "<a>$b$<c/>$d$</e>",
  48         },
  49         // Angle brackets that aren't a tag.
  50         {
  51                 "not a tag #0",
  52                 "<",
  53                 "&lt;",
  54         },
  55         {
  56                 "not a tag #1",
  57                 "</",
  58                 "&lt;/",
  59         },
  60         {
  61                 "not a tag #2",
  62                 "</>",
  63                 "",
  64         },
  65         {
  66                 "not a tag #3",
  67                 "a</>b",
  68                 "a$b",
  69         },
  70         {
  71                 "not a tag #4",
  72                 "</ >",
  73                 "<!-- -->",
  74         },
  75         {
  76                 "not a tag #5",
  77                 "</.",
  78                 "<!--.-->",
  79         },
  80         {
  81                 "not a tag #6",
  82                 "</.>",
  83                 "<!--.-->",
  84         },
  85         {
  86                 "not a tag #7",
  87                 "a < b",
  88                 "a &lt; b",
  89         },
  90         {
  91                 "not a tag #8",
  92                 "<.>",
  93                 "&lt;.&gt;",
  94         },
  95         {
  96                 "not a tag #9",
  97                 "a<<<b>>>c",
  98                 "a&lt;&lt;$<b>$&gt;&gt;c",
  99         },
 100         {
 101                 "not a tag #10",
 102                 "if x<0 and y < 0 then x*y>0",
 103                 "if x&lt;0 and y &lt; 0 then x*y&gt;0",
 104         },
 105         // EOF in a tag name.
 106         {
 107                 "tag name eof #0",
 108                 "<a",
 109                 "",
 110         },
 111         {
 112                 "tag name eof #1",
 113                 "<a ",
 114                 "",
 115         },
 116         {
 117                 "tag name eof #2",
 118                 "a<b",
 119                 "a",
 120         },
 121         {
 122                 "tag name eof #3",
 123                 "<a><b",
 124                 "<a>",
 125         },
 126         {
 127                 "tag name eof #4",
 128                 `<a x`,
 129                 `<a x="">`,
 130         },
 131         // Some malformed tags that are missing a '>'.
 132         {
 133                 "malformed tag #0",
 134                 `<p</p>`,
 135                 `<p< p="">`,
 136         },
 137         {
 138                 "malformed tag #1",
 139                 `<p </p>`,
 140                 `<p <="" p="">`,
 141         },
 142         {
 143                 "malformed tag #2",
 144                 `<p id`,
 145                 `<p id="">`,
 146         },
 147         {
 148                 "malformed tag #3",
 149                 `<p id=`,
 150                 `<p id="">`,
 151         },
 152         {
 153                 "malformed tag #4",
 154                 `<p id=>`,
 155                 `<p id="">`,
 156         },
 157         {
 158                 "malformed tag #5",
 159                 `<p id=0`,
 160                 `<p id="0">`,
 161         },
 162         {
 163                 "malformed tag #6",
 164                 `<p id=0</p>`,
 165                 `<p id="0&lt;/p">`,
 166         },
 167         {
 168                 "malformed tag #7",
 169                 `<p id="0</p>`,
 170                 `<p id="0&lt;/p&gt;">`,
 171         },
 172         {
 173                 "malformed tag #8",
 174                 `<p id="0"</p>`,
 175                 `<p id="0" <="" p="">`,
 176         },
 177         // Raw text and RCDATA.
 178         {
 179                 "basic raw text",
 180                 "<script><a></b></script>",
 181                 "<script>$&lt;a&gt;&lt;/b&gt;$</script>",
 182         },
 183         {
 184                 "unfinished script end tag",
 185                 "<SCRIPT>a</SCR",
 186                 "<script>$a&lt;/SCR",
 187         },
 188         {
 189                 "broken script end tag",
 190                 "<SCRIPT>a</SCR ipt>",
 191                 "<script>$a&lt;/SCR ipt&gt;",
 192         },
 193         {
 194                 "EOF in script end tag",
 195                 "<SCRIPT>a</SCRipt",
 196                 "<script>$a&lt;/SCRipt",
 197         },
 198         {
 199                 "scriptx end tag",
 200                 "<SCRIPT>a</SCRiptx",
 201                 "<script>$a&lt;/SCRiptx",
 202         },
 203         {
 204                 "' ' completes script end tag",
 205                 "<SCRIPT>a</SCRipt ",
 206                 "<script>$a$</script>",
 207         },
 208         {
 209                 "'>' completes script end tag",
 210                 "<SCRIPT>a</SCRipt>",
 211                 "<script>$a$</script>",
 212         },
 213         {
 214                 "self-closing script end tag",
 215                 "<SCRIPT>a</SCRipt/>",
 216                 "<script>$a$</script>",
 217         },
 218         {
 219                 "nested script tag",
 220                 "<SCRIPT>a</SCRipt<script>",
 221                 "<script>$a&lt;/SCRipt&lt;script&gt;",
 222         },
 223         {
 224                 "script end tag after unfinished",
 225                 "<SCRIPT>a</SCRipt</script>",
 226                 "<script>$a&lt;/SCRipt$</script>",
 227         },
 228         {
 229                 "script/style mismatched tags",
 230                 "<script>a</style>",
 231                 "<script>$a&lt;/style&gt;",
 232         },
 233         {
 234                 "style element with entity",
 235                 "<style>&apos;",
 236                 "<style>$&amp;apos;",
 237         },
 238         {
 239                 "textarea with tag",
 240                 "<textarea><div></textarea>",
 241                 "<textarea>$&lt;div&gt;$</textarea>",
 242         },
 243         {
 244                 "title with tag and entity",
 245                 "<title><b>K&amp;R C</b></title>",
 246                 "<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
 247         },
 248         // DOCTYPE tests.
 249         {
 250                 "Proper DOCTYPE",
 251                 "<!DOCTYPE html>",
 252                 "<!DOCTYPE html>",
 253         },
 254         {
 255                 "DOCTYPE with no space",
 256                 "<!doctypehtml>",
 257                 "<!DOCTYPE html>",
 258         },
 259         {
 260                 "DOCTYPE with two spaces",
 261                 "<!doctype  html>",
 262                 "<!DOCTYPE html>",
 263         },
 264         {
 265                 "looks like DOCTYPE but isn't",
 266                 "<!DOCUMENT html>",
 267                 "<!--DOCUMENT html-->",
 268         },
 269         {
 270                 "DOCTYPE at EOF",
 271                 "<!DOCtype",
 272                 "<!DOCTYPE >",
 273         },
 274         // XML processing instructions.
 275         {
 276                 "XML processing instruction",
 277                 "<?xml?>",
 278                 "<!--?xml?-->",
 279         },
 280         // Comments.
 281         {
 282                 "comment0",
 283                 "abc<b><!-- skipme --></b>def",
 284                 "abc$<b>$<!-- skipme -->$</b>$def",
 285         },
 286         {
 287                 "comment1",
 288                 "a<!-->z",
 289                 "a$<!---->$z",
 290         },
 291         {
 292                 "comment2",
 293                 "a<!--->z",
 294                 "a$<!---->$z",
 295         },
 296         {
 297                 "comment3",
 298                 "a<!--x>-->z",
 299                 "a$<!--x>-->$z",
 300         },
 301         {
 302                 "comment4",
 303                 "a<!--x->-->z",
 304                 "a$<!--x->-->$z",
 305         },
 306         {
 307                 "comment5",
 308                 "a<!>z",
 309                 "a$<!---->$z",
 310         },
 311         {
 312                 "comment6",
 313                 "a<!->z",
 314                 "a$<!----->$z",
 315         },
 316         {
 317                 "comment7",
 318                 "a<!---<>z",
 319                 "a$<!---<>z-->",
 320         },
 321         {
 322                 "comment8",
 323                 "a<!--z",
 324                 "a$<!--z-->",
 325         },
 326         {
 327                 "comment9",
 328                 "a<!--x--!>z",
 329                 "a$<!--x-->$z",
 330         },
 331         // An attribute with a backslash.
 332         {
 333                 "backslash",
 334                 `<p id="a\"b">`,
 335                 `<p id="a\" b"="">`,
 336         },
 337         // Entities, tag name and attribute key lower-casing, and whitespace
 338         // normalization within a tag.
 339         {
 340                 "tricky",
 341                 "<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
 342                 `<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
 343         },
 344         // A nonexistent entity. Tokenizing and converting back to a string should
 345         // escape the "&" to become "&amp;".
 346         {
 347                 "noSuchEntity",
 348                 `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
 349                 `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
 350         },
 351         /*
 352                 // TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
 353                 {
 354                         "entity without semicolon",
 355                         `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
 356                         `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
 357                 },
 358         */
 359         {
 360                 "entity with digits",
 361                 "&frac12;",
 362                 "½",
 363         },
 364         // Attribute tests:
 365         // http://dev.w3.org/html5/spec/Overview.html#attributes-0
 366         {
 367                 "Empty attribute",
 368                 `<input disabled FOO>`,
 369                 `<input disabled="" foo="">`,
 370         },
 371         {
 372                 "Empty attribute, whitespace",
 373                 `<input disabled FOO >`,
 374                 `<input disabled="" foo="">`,
 375         },
 376         {
 377                 "Unquoted attribute value",
 378                 `<input value=yes FOO=BAR>`,
 379                 `<input value="yes" foo="BAR">`,
 380         },
 381         {
 382                 "Unquoted attribute value, spaces",
 383                 `<input value = yes FOO = BAR>`,
 384                 `<input value="yes" foo="BAR">`,
 385         },
 386         {
 387                 "Unquoted attribute value, trailing space",
 388                 `<input value=yes FOO=BAR >`,
 389                 `<input value="yes" foo="BAR">`,
 390         },
 391         {
 392                 "Single-quoted attribute value",
 393                 `<input value='yes' FOO='BAR'>`,
 394                 `<input value="yes" foo="BAR">`,
 395         },
 396         {
 397                 "Single-quoted attribute value, trailing space",
 398                 `<input value='yes' FOO='BAR' >`,
 399                 `<input value="yes" foo="BAR">`,
 400         },
 401         {
 402                 "Double-quoted attribute value",
 403                 `<input value="I'm an attribute" FOO="BAR">`,
 404                 `<input value="I&apos;m an attribute" foo="BAR">`,
 405         },
 406         {
 407                 "Attribute name characters",
 408                 `<meta http-equiv="content-type">`,
 409                 `<meta http-equiv="content-type">`,
 410         },
 411         {
 412                 "Mixed attributes",
 413                 `a<P V="0 1" w='2' X=3 y>z`,
 414                 `a$<p v="0 1" w="2" x="3" y="">$z`,
 415         },
 416         {
 417                 "Attributes with a solitary single quote",
 418                 `<p id=can't><p id=won't>`,
 419                 `<p id="can&apos;t">$<p id="won&apos;t">`,
 420         },
 421 }
 422
 423 func TestTokenizer(t *testing.T) {
 424 loop:
 425         for _, tt := range tokenTests {
 426                 z := NewTokenizer(strings.NewReader(tt.html))
 427                 if tt.golden != "" {
 428                         for i, s := range strings.Split(tt.golden, "$") {
 429                                 if z.Next() == ErrorToken {
 430                                         t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
 431                                         continue loop
 432                                 }
 433                                 actual := z.Token().String()
 434                                 if s != actual {
 435                                         t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
 436                                         continue loop
 437                                 }
 438                         }
 439                 }
 440                 z.Next()
 441                 if z.Error() != os.EOF {
 442                         t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String())
 443                 }
 444         }
 445 }
 446
 447 type unescapeTest struct {
 448         // A short description of the test case.
 449         desc string
 450         // The HTML text.
 451         html string
 452         // The unescaped text.
 453         unescaped string
 454 }
 455
 456 var unescapeTests = []unescapeTest{
 457         // Handle no entities.
 458         {
 459                 "copy",
 460                 "A\ttext\nstring",
 461                 "A\ttext\nstring",
 462         },
 463         // Handle simple named entities.
 464         {
 465                 "simple",
 466                 "&amp; &gt; &lt;",
 467                 "& > <",
 468         },
 469         // Handle hitting the end of the string.
 470         {
 471                 "stringEnd",
 472                 "&amp &amp",
 473                 "& &",
 474         },
 475         // Handle entities with two codepoints.
 476         {
 477                 "multiCodepoint",
 478                 "text &gesl; blah",
 479                 "text \u22db\ufe00 blah",
 480         },
 481         // Handle decimal numeric entities.
 482         {
 483                 "decimalEntity",
 484                 "Delta = &#916; ",
 485                 "Delta = Δ ",
 486         },
 487         // Handle hexadecimal numeric entities.
 488         {
 489                 "hexadecimalEntity",
 490                 "Lambda = &#x3bb; = &#X3Bb ",
 491                 "Lambda = λ = λ ",
 492         },
 493         // Handle numeric early termination.
 494         {
 495                 "numericEnds",
 496                 "&# &#x &#128;43 &copy = &#169f = &#xa9",
 497                 "&# &#x €43 © = ©f = ©",
 498         },
 499         // Handle numeric ISO-8859-1 entity replacements.
 500         {
 501                 "numericReplacements",
 502                 "Footnote&#x87;",
 503                 "Footnote‡",
 504         },
 505 }
 506
 507 func TestUnescape(t *testing.T) {
 508         for _, tt := range unescapeTests {
 509                 unescaped := UnescapeString(tt.html)
 510                 if unescaped != tt.unescaped {
 511                         t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped)
 512                 }
 513         }
 514 }
 515
 516 func TestUnescapeEscape(t *testing.T) {
 517         ss := []string{
 518                 ``,
 519                 `abc def`,
 520                 `a & b`,
 521                 `a&amp;b`,
 522                 `a &amp b`,
 523                 `&quot;`,
 524                 `"`,
 525                 `"<&>"`,
 526                 `&quot;&lt;&amp;&gt;&quot;`,
 527                 `3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
 528         }
 529         for _, s := range ss {
 530                 if s != UnescapeString(EscapeString(s)) {
 531                         t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
 532                 }
 533         }
 534 }
 535
 536 func TestBufAPI(t *testing.T) {
 537         s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
 538         z := NewTokenizer(bytes.NewBuffer([]byte(s)))
 539         result := bytes.NewBuffer(nil)
 540         depth := 0
 541 loop:
 542         for {
 543                 tt := z.Next()
 544                 switch tt {
 545                 case ErrorToken:
 546                         if z.Error() != os.EOF {
 547                                 t.Error(z.Error())
 548                         }
 549                         break loop
 550                 case TextToken:
 551                         if depth > 0 {
 552                                 result.Write(z.Text())
 553                         }
 554                 case StartTagToken, EndTagToken:
 555                         tn, _ := z.TagName()
 556                         if len(tn) == 1 && tn[0] == 'a' {
 557                                 if tt == StartTagToken {
 558                                         depth++
 559                                 } else {
 560                                         depth--
 561                                 }
 562                         }
 563                 }
 564         }
 565         u := "14567"
 566         v := string(result.Bytes())
 567         if u != v {
 568                 t.Errorf("TestBufAPI: want %q got %q", u, v)
 569         }
 570 }