1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 type tokenTest struct {
15 // A short description of the test case.
19 // The string representations of the expected tokens, joined by '$'.
23 var tokenTests = []tokenTest{
29 // A single text node. The tokenizer should not break text nodes on whitespace,
30 // nor should it normalize whitespace within a text node.
42 // A start, self-closing and end tag. The tokenizer does not care if the start
43 // and end tokens don't match; that is the job of the parser.
49 // Angle brackets that aren't a tag.
98 "a<<$<b>$>>c",
102 "if x<0 and y < 0 then x*y>0",
103 "if x<0 and y < 0 then x*y>0",
105 // EOF in a tag name.
131 // Some malformed tags that are missing a '>'.
170 `<p id="0</p>">`,
175 `<p id="0" <="" p="">`,
177 // Raw text and RCDATA.
180 "<script><a></b></script>",
181 "<script>$<a></b>$</script>",
184 "unfinished script end tag",
186 "<script>$a</SCR",
189 "broken script end tag",
190 "<SCRIPT>a</SCR ipt>",
191 "<script>$a</SCR ipt>",
194 "EOF in script end tag",
196 "<script>$a</SCRipt",
200 "<SCRIPT>a</SCRiptx",
201 "<script>$a</SCRiptx",
204 "' ' completes script end tag",
205 "<SCRIPT>a</SCRipt ",
206 "<script>$a$</script>",
209 "'>' completes script end tag",
210 "<SCRIPT>a</SCRipt>",
211 "<script>$a$</script>",
214 "self-closing script end tag",
215 "<SCRIPT>a</SCRipt/>",
216 "<script>$a$</script>",
220 "<SCRIPT>a</SCRipt<script>",
221 "<script>$a</SCRipt<script>",
224 "script end tag after unfinished",
225 "<SCRIPT>a</SCRipt</script>",
226 "<script>$a</SCRipt$</script>",
229 "script/style mismatched tags",
231 "<script>$a</style>",
234 "style element with entity",
236 "<style>$&apos;",
240 "<textarea><div></textarea>",
241 "<textarea>$<div>$</textarea>",
244 "title with tag and entity",
245 "<title><b>K&R C</b></title>",
246 "<title>$<b>K&R C</b>$</title>",
255 "DOCTYPE with no space",
260 "DOCTYPE with two spaces",
265 "looks like DOCTYPE but isn't",
267 "<!--DOCUMENT html-->",
274 // XML processing instructions.
276 "XML processing instruction",
283 "abc<b><!-- skipme --></b>def",
284 "abc$<b>$<!-- skipme -->$</b>$def",
331 // An attribute with a backslash.
337 // Entities, tag name and attribute key lower-casing, and whitespace
338 // normalization within a tag.
341 "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>",
342 `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`,
344 // A nonexistent entity. Tokenizing and converting back to a string should
345 // escape the "&" to become "&amp;".
348 `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
349 `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
352 // TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
354 "entity without semicolon",
355 `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`,
356 `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`,
360 "entity with digits",
365 // http://dev.w3.org/html5/spec/Overview.html#attributes-0
368 `<input disabled FOO>`,
369 `<input disabled="" foo="">`,
372 "Empty attribute, whitespace",
373 `<input disabled FOO >`,
374 `<input disabled="" foo="">`,
377 "Unquoted attribute value",
378 `<input value=yes FOO=BAR>`,
379 `<input value="yes" foo="BAR">`,
382 "Unquoted attribute value, spaces",
383 `<input value = yes FOO = BAR>`,
384 `<input value="yes" foo="BAR">`,
387 "Unquoted attribute value, trailing space",
388 `<input value=yes FOO=BAR >`,
389 `<input value="yes" foo="BAR">`,
392 "Single-quoted attribute value",
393 `<input value='yes' FOO='BAR'>`,
394 `<input value="yes" foo="BAR">`,
397 "Single-quoted attribute value, trailing space",
398 `<input value='yes' FOO='BAR' >`,
399 `<input value="yes" foo="BAR">`,
402 "Double-quoted attribute value",
403 `<input value="I'm an attribute" FOO="BAR">`,
404 `<input value="I'm an attribute" foo="BAR">`,
407 "Attribute name characters",
408 `<meta http-equiv="content-type">`,
409 `<meta http-equiv="content-type">`,
413 `a<P V="0 1" w='2' X=3 y>z`,
414 `a$<p v="0 1" w="2" x="3" y="">$z`,
417 "Attributes with a solitary single quote",
418 `<p id=can't><p id=won't>`,
419 `<p id="can't">$<p id="won't">`,
423 func TestTokenizer(t *testing.T) {
425 for _, tt := range tokenTests {
426 z := NewTokenizer(strings.NewReader(tt.html))
428 for i, s := range strings.Split(tt.golden, "$") {
429 if z.Next() == ErrorToken {
430 t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
433 actual := z.Token().String()
435 t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
441 if z.Error() != os.EOF {
442 t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String())
447 type unescapeTest struct {
448 // A short description of the test case.
452 // The unescaped text.
456 var unescapeTests = []unescapeTest{
457 // Handle no entities.
463 // Handle simple named entities.
469 // Handle hitting the end of the string.
475 // Handle entities with two codepoints.
479 "text \u22db\ufe00 blah",
481 // Handle decimal numeric entities.
487 // Handle hexadecimal numeric entities.
490 "Lambda = λ = λ ",
493 // Handle numeric early termination.
496 "&# &#x €43 © = ©f = ©",
497 "&# &#x €43 © = ©f = ©",
499 // Handle numeric ISO-8859-1 entity replacements.
501 "numericReplacements",
507 func TestUnescape(t *testing.T) {
508 for _, tt := range unescapeTests {
509 unescaped := UnescapeString(tt.html)
510 if unescaped != tt.unescaped {
511 t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped)
516 func TestUnescapeEscape(t *testing.T) {
526 `"<&>"`,
527 `3&5==1 && 0<1, "0<1", a+acute=á`,
529 for _, s := range ss {
530 if s != UnescapeString(EscapeString(s)) {
531 t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
536 func TestBufAPI(t *testing.T) {
537 s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
538 z := NewTokenizer(bytes.NewBuffer([]byte(s)))
539 result := bytes.NewBuffer(nil)
546 if z.Error() != os.EOF {
552 result.Write(z.Text())
554 case StartTagToken, EndTagToken:
556 if len(tn) == 1 && tn[0] == 'a' {
557 if tt == StartTagToken {
566 v := string(result.Bytes())
568 t.Errorf("TestBufAPI: want %q got %q", u, v)