OSDN Git Service

libgo: update to weekly.2011-10-25
[pf3gnuchains/gcc-fork.git] / libgo / go / html / token_test.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package html
6
7 import (
8         "bytes"
9         "os"
10         "strings"
11         "testing"
12 )
13
// tokenTest is a single tokenizer test case: an input document and the
// expected rendering of the tokens it produces.
type tokenTest struct {
	desc   string // short description of the test case
	html   string // the HTML to parse
	golden string // string forms of the expected tokens, joined by '$'
}
22
23 var tokenTests = []tokenTest{
24         {
25                 "empty",
26                 "",
27                 "",
28         },
29         // A single text node. The tokenizer should not break text nodes on whitespace,
30         // nor should it normalize whitespace within a text node.
31         {
32                 "text",
33                 "foo  bar",
34                 "foo  bar",
35         },
36         // An entity.
37         {
38                 "entity",
39                 "one < two",
40                 "one < two",
41         },
42         // A start, self-closing and end tag. The tokenizer does not care if the start
43         // and end tokens don't match; that is the job of the parser.
44         {
45                 "tags",
46                 "<a>b<c/>d</e>",
47                 "<a>$b$<c/>$d$</e>",
48         },
49         // Angle brackets that aren't a tag.
50         {
51                 "not a tag #0",
52                 "<",
53                 "&lt;",
54         },
55         {
56                 "not a tag #1",
57                 "</",
58                 "&lt;/",
59         },
60         {
61                 "not a tag #2",
62                 "</>",
63                 "",
64         },
65         {
66                 "not a tag #3",
67                 "a</>b",
68                 "a$b",
69         },
70         {
71                 "not a tag #4",
72                 "</ >",
73                 "<!-- -->",
74         },
75         {
76                 "not a tag #5",
77                 "</.",
78                 "<!--.-->",
79         },
80         {
81                 "not a tag #6",
82                 "</.>",
83                 "<!--.-->",
84         },
85         {
86                 "not a tag #7",
87                 "a < b",
88                 "a &lt; b",
89         },
90         {
91                 "not a tag #8",
92                 "<.>",
93                 "&lt;.&gt;",
94         },
95         {
96                 "not a tag #9",
97                 "a<<<b>>>c",
98                 "a&lt;&lt;$<b>$&gt;&gt;c",
99         },
100         {
101                 "not a tag #10",
102                 "if x<0 and y < 0 then x*y>0",
103                 "if x&lt;0 and y &lt; 0 then x*y&gt;0",
104         },
105         // EOF in a tag name.
106         {
107                 "tag name eof #0",
108                 "<a",
109                 "",
110         },
111         {
112                 "tag name eof #1",
113                 "<a ",
114                 "",
115         },
116         {
117                 "tag name eof #2",
118                 "a<b",
119                 "a",
120         },
121         {
122                 "tag name eof #3",
123                 "<a><b",
124                 "<a>",
125         },
126         {
127                 "tag name eof #4",
128                 `<a x`,
129                 `<a x="">`,
130         },
131         // Some malformed tags that are missing a '>'.
132         {
133                 "malformed tag #0",
134                 `<p</p>`,
135                 `<p< p="">`,
136         },
137         {
138                 "malformed tag #1",
139                 `<p </p>`,
140                 `<p <="" p="">`,
141         },
142         {
143                 "malformed tag #2",
144                 `<p id`,
145                 `<p id="">`,
146         },
147         {
148                 "malformed tag #3",
149                 `<p id=`,
150                 `<p id="">`,
151         },
152         {
153                 "malformed tag #4",
154                 `<p id=>`,
155                 `<p id="">`,
156         },
157         {
158                 "malformed tag #5",
159                 `<p id=0`,
160                 `<p id="0">`,
161         },
162         {
163                 "malformed tag #6",
164                 `<p id=0</p>`,
165                 `<p id="0&lt;/p">`,
166         },
167         {
168                 "malformed tag #7",
169                 `<p id="0</p>`,
170                 `<p id="0&lt;/p&gt;">`,
171         },
172         {
173                 "malformed tag #8",
174                 `<p id="0"</p>`,
175                 `<p id="0" <="" p="">`,
176         },
177         // Raw text and RCDATA.
178         {
179                 "basic raw text",
180                 "<script><a></b></script>",
181                 "<script>$&lt;a&gt;&lt;/b&gt;$</script>",
182         },
183         {
184                 "unfinished script end tag",
185                 "<SCRIPT>a</SCR",
186                 "<script>$a&lt;/SCR",
187         },
188         {
189                 "broken script end tag",
190                 "<SCRIPT>a</SCR ipt>",
191                 "<script>$a&lt;/SCR ipt&gt;",
192         },
193         {
194                 "EOF in script end tag",
195                 "<SCRIPT>a</SCRipt",
196                 "<script>$a&lt;/SCRipt",
197         },
198         {
199                 "scriptx end tag",
200                 "<SCRIPT>a</SCRiptx",
201                 "<script>$a&lt;/SCRiptx",
202         },
203         {
204                 "' ' completes script end tag",
205                 "<SCRIPT>a</SCRipt ",
206                 "<script>$a$</script>",
207         },
208         {
209                 "'>' completes script end tag",
210                 "<SCRIPT>a</SCRipt>",
211                 "<script>$a$</script>",
212         },
213         {
214                 "self-closing script end tag",
215                 "<SCRIPT>a</SCRipt/>",
216                 "<script>$a$</script>",
217         },
218         {
219                 "nested script tag",
220                 "<SCRIPT>a</SCRipt<script>",
221                 "<script>$a&lt;/SCRipt&lt;script&gt;",
222         },
223         {
224                 "script end tag after unfinished",
225                 "<SCRIPT>a</SCRipt</script>",
226                 "<script>$a&lt;/SCRipt$</script>",
227         },
228         {
229                 "script/style mismatched tags",
230                 "<script>a</style>",
231                 "<script>$a&lt;/style&gt;",
232         },
233         {
234                 "style element with entity",
235                 "<style>&apos;",
236                 "<style>$&amp;apos;",
237         },
238         {
239                 "textarea with tag",
240                 "<textarea><div></textarea>",
241                 "<textarea>$&lt;div&gt;$</textarea>",
242         },
243         {
244                 "title with tag and entity",
245                 "<title><b>K&amp;R C</b></title>",
246                 "<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
247         },
248         // DOCTYPE tests.
249         {
250                 "Proper DOCTYPE",
251                 "<!DOCTYPE html>",
252                 "<!DOCTYPE html>",
253         },
254         {
255                 "DOCTYPE with no space",
256                 "<!doctypehtml>",
257                 "<!DOCTYPE html>",
258         },
259         {
260                 "DOCTYPE with two spaces",
261                 "<!doctype  html>",
262                 "<!DOCTYPE html>",
263         },
264         {
265                 "looks like DOCTYPE but isn't",
266                 "<!DOCUMENT html>",
267                 "<!--DOCUMENT html-->",
268         },
269         {
270                 "DOCTYPE at EOF",
271                 "<!DOCtype",
272                 "<!DOCTYPE >",
273         },
274         // XML processing instructions.
275         {
276                 "XML processing instruction",
277                 "<?xml?>",
278                 "<!--?xml?-->",
279         },
280         // Comments.
281         {
282                 "comment0",
283                 "abc<b><!-- skipme --></b>def",
284                 "abc$<b>$<!-- skipme -->$</b>$def",
285         },
286         {
287                 "comment1",
288                 "a<!-->z",
289                 "a$<!---->$z",
290         },
291         {
292                 "comment2",
293                 "a<!--->z",
294                 "a$<!---->$z",
295         },
296         {
297                 "comment3",
298                 "a<!--x>-->z",
299                 "a$<!--x>-->$z",
300         },
301         {
302                 "comment4",
303                 "a<!--x->-->z",
304                 "a$<!--x->-->$z",
305         },
306         {
307                 "comment5",
308                 "a<!>z",
309                 "a$<!---->$z",
310         },
311         {
312                 "comment6",
313                 "a<!->z",
314                 "a$<!----->$z",
315         },
316         {
317                 "comment7",
318                 "a<!---<>z",
319                 "a$<!---<>z-->",
320         },
321         {
322                 "comment8",
323                 "a<!--z",
324                 "a$<!--z-->",
325         },
326         {
327                 "comment9",
328                 "a<!--x--!>z",
329                 "a$<!--x-->$z",
330         },
331         // An attribute with a backslash.
332         {
333                 "backslash",
334                 `<p id="a\"b">`,
335                 `<p id="a\" b"="">`,
336         },
337         // Entities, tag name and attribute key lower-casing, and whitespace
338         // normalization within a tag.
339         {
340                 "tricky",
341                 "<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
342                 `<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
343         },
344         // A nonexistent entity. Tokenizing and converting back to a string should
345         // escape the "&" to become "&amp;".
346         {
347                 "noSuchEntity",
348                 `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
349                 `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
350         },
351         /*
352                 // TODO: re-enable this test when it works. This input/output matches html5lib's behavior.
353                 {
354                         "entity without semicolon",
355                         `&notit;&notin;<a b="q=z&amp=5&notice=hello&not;=world">`,
356                         `¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
357                 },
358         */
359         {
360                 "entity with digits",
361                 "&frac12;",
362                 "½",
363         },
364         // Attribute tests:
365         // http://dev.w3.org/html5/spec/Overview.html#attributes-0
366         {
367                 "Empty attribute",
368                 `<input disabled FOO>`,
369                 `<input disabled="" foo="">`,
370         },
371         {
372                 "Empty attribute, whitespace",
373                 `<input disabled FOO >`,
374                 `<input disabled="" foo="">`,
375         },
376         {
377                 "Unquoted attribute value",
378                 `<input value=yes FOO=BAR>`,
379                 `<input value="yes" foo="BAR">`,
380         },
381         {
382                 "Unquoted attribute value, spaces",
383                 `<input value = yes FOO = BAR>`,
384                 `<input value="yes" foo="BAR">`,
385         },
386         {
387                 "Unquoted attribute value, trailing space",
388                 `<input value=yes FOO=BAR >`,
389                 `<input value="yes" foo="BAR">`,
390         },
391         {
392                 "Single-quoted attribute value",
393                 `<input value='yes' FOO='BAR'>`,
394                 `<input value="yes" foo="BAR">`,
395         },
396         {
397                 "Single-quoted attribute value, trailing space",
398                 `<input value='yes' FOO='BAR' >`,
399                 `<input value="yes" foo="BAR">`,
400         },
401         {
402                 "Double-quoted attribute value",
403                 `<input value="I'm an attribute" FOO="BAR">`,
404                 `<input value="I&apos;m an attribute" foo="BAR">`,
405         },
406         {
407                 "Attribute name characters",
408                 `<meta http-equiv="content-type">`,
409                 `<meta http-equiv="content-type">`,
410         },
411         {
412                 "Mixed attributes",
413                 `a<P V="0 1" w='2' X=3 y>z`,
414                 `a$<p v="0 1" w="2" x="3" y="">$z`,
415         },
416         {
417                 "Attributes with a solitary single quote",
418                 `<p id=can't><p id=won't>`,
419                 `<p id="can&apos;t">$<p id="won&apos;t">`,
420         },
421 }
422
423 func TestTokenizer(t *testing.T) {
424 loop:
425         for _, tt := range tokenTests {
426                 z := NewTokenizer(strings.NewReader(tt.html))
427                 if tt.golden != "" {
428                         for i, s := range strings.Split(tt.golden, "$") {
429                                 if z.Next() == ErrorToken {
430                                         t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
431                                         continue loop
432                                 }
433                                 actual := z.Token().String()
434                                 if s != actual {
435                                         t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
436                                         continue loop
437                                 }
438                         }
439                 }
440                 z.Next()
441                 if z.Error() != os.EOF {
442                         t.Errorf("%s: want EOF got %q", tt.desc, z.Token().String())
443                 }
444         }
445 }
446
// unescapeTest is a single UnescapeString test case: escaped input text and
// the text expected after unescaping.
type unescapeTest struct {
	desc      string // short description of the test case
	html      string // the HTML text
	unescaped string // the unescaped text
}
455
456 var unescapeTests = []unescapeTest{
457         // Handle no entities.
458         {
459                 "copy",
460                 "A\ttext\nstring",
461                 "A\ttext\nstring",
462         },
463         // Handle simple named entities.
464         {
465                 "simple",
466                 "&amp; &gt; &lt;",
467                 "& > <",
468         },
469         // Handle hitting the end of the string.
470         {
471                 "stringEnd",
472                 "&amp &amp",
473                 "& &",
474         },
475         // Handle entities with two codepoints.
476         {
477                 "multiCodepoint",
478                 "text &gesl; blah",
479                 "text \u22db\ufe00 blah",
480         },
481         // Handle decimal numeric entities.
482         {
483                 "decimalEntity",
484                 "Delta = &#916; ",
485                 "Delta = Δ ",
486         },
487         // Handle hexadecimal numeric entities.
488         {
489                 "hexadecimalEntity",
490                 "Lambda = &#x3bb; = &#X3Bb ",
491                 "Lambda = λ = λ ",
492         },
493         // Handle numeric early termination.
494         {
495                 "numericEnds",
496                 "&# &#x &#128;43 &copy = &#169f = &#xa9",
497                 "&# &#x €43 © = ©f = ©",
498         },
499         // Handle numeric ISO-8859-1 entity replacements.
500         {
501                 "numericReplacements",
502                 "Footnote&#x87;",
503                 "Footnote‡",
504         },
505 }
506
507 func TestUnescape(t *testing.T) {
508         for _, tt := range unescapeTests {
509                 unescaped := UnescapeString(tt.html)
510                 if unescaped != tt.unescaped {
511                         t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped)
512                 }
513         }
514 }
515
516 func TestUnescapeEscape(t *testing.T) {
517         ss := []string{
518                 ``,
519                 `abc def`,
520                 `a & b`,
521                 `a&amp;b`,
522                 `a &amp b`,
523                 `&quot;`,
524                 `"`,
525                 `"<&>"`,
526                 `&quot;&lt;&amp;&gt;&quot;`,
527                 `3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
528         }
529         for _, s := range ss {
530                 if s != UnescapeString(EscapeString(s)) {
531                         t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
532                 }
533         }
534 }
535
536 func TestBufAPI(t *testing.T) {
537         s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
538         z := NewTokenizer(bytes.NewBuffer([]byte(s)))
539         result := bytes.NewBuffer(nil)
540         depth := 0
541 loop:
542         for {
543                 tt := z.Next()
544                 switch tt {
545                 case ErrorToken:
546                         if z.Error() != os.EOF {
547                                 t.Error(z.Error())
548                         }
549                         break loop
550                 case TextToken:
551                         if depth > 0 {
552                                 result.Write(z.Text())
553                         }
554                 case StartTagToken, EndTagToken:
555                         tn, _ := z.TagName()
556                         if len(tn) == 1 && tn[0] == 'a' {
557                                 if tt == StartTagToken {
558                                         depth++
559                                 } else {
560                                         depth--
561                                 }
562                         }
563                 }
564         }
565         u := "14567"
566         v := string(result.Bytes())
567         if u != v {
568                 t.Errorf("TestBufAPI: want %q got %q", u, v)
569         }
570 }