1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
16 <?xml version="1.0" encoding="UTF-8"?>
17 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
18 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
19 <body xmlns:foo="ns1" xmlns="ns2" xmlns:tag="ns3" ` +
21 <hello lang="en">World <>'" 白鵬翔</hello>
23 <outer foo:attr="value" xmlns:tag="ns4">
27 <![CDATA[Some text here.]]>
29 </body><!-- missing final newline -->`
31 var rawTokens = []Token{
32 CharData([]byte("\n")),
33 ProcInst{"xml", []byte(`version="1.0" encoding="UTF-8"`)},
34 CharData([]byte("\n")),
35 Directive([]byte(`DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
36 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"`),
38 CharData([]byte("\n")),
39 StartElement{Name{"", "body"}, []Attr{{Name{"xmlns", "foo"}, "ns1"}, {Name{"", "xmlns"}, "ns2"}, {Name{"xmlns", "tag"}, "ns3"}}},
40 CharData([]byte("\n ")),
41 StartElement{Name{"", "hello"}, []Attr{{Name{"", "lang"}, "en"}}},
42 CharData([]byte("World <>'\" 白鵬翔")),
43 EndElement{Name{"", "hello"}},
44 CharData([]byte("\n ")),
45 StartElement{Name{"", "goodbye"}, []Attr{}},
46 EndElement{Name{"", "goodbye"}},
47 CharData([]byte("\n ")),
48 StartElement{Name{"", "outer"}, []Attr{{Name{"foo", "attr"}, "value"}, {Name{"xmlns", "tag"}, "ns4"}}},
49 CharData([]byte("\n ")),
50 StartElement{Name{"", "inner"}, []Attr{}},
51 EndElement{Name{"", "inner"}},
52 CharData([]byte("\n ")),
53 EndElement{Name{"", "outer"}},
54 CharData([]byte("\n ")),
55 StartElement{Name{"tag", "name"}, []Attr{}},
56 CharData([]byte("\n ")),
57 CharData([]byte("Some text here.")),
58 CharData([]byte("\n ")),
59 EndElement{Name{"tag", "name"}},
60 CharData([]byte("\n")),
61 EndElement{Name{"", "body"}},
62 Comment([]byte(" missing final newline ")),
65 var cookedTokens = []Token{
66 CharData([]byte("\n")),
67 ProcInst{"xml", []byte(`version="1.0" encoding="UTF-8"`)},
68 CharData([]byte("\n")),
69 Directive([]byte(`DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
70 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"`),
72 CharData([]byte("\n")),
73 StartElement{Name{"ns2", "body"}, []Attr{{Name{"xmlns", "foo"}, "ns1"}, {Name{"", "xmlns"}, "ns2"}, {Name{"xmlns", "tag"}, "ns3"}}},
74 CharData([]byte("\n ")),
75 StartElement{Name{"ns2", "hello"}, []Attr{{Name{"", "lang"}, "en"}}},
76 CharData([]byte("World <>'\" 白鵬翔")),
77 EndElement{Name{"ns2", "hello"}},
78 CharData([]byte("\n ")),
79 StartElement{Name{"ns2", "goodbye"}, []Attr{}},
80 EndElement{Name{"ns2", "goodbye"}},
81 CharData([]byte("\n ")),
82 StartElement{Name{"ns2", "outer"}, []Attr{{Name{"ns1", "attr"}, "value"}, {Name{"xmlns", "tag"}, "ns4"}}},
83 CharData([]byte("\n ")),
84 StartElement{Name{"ns2", "inner"}, []Attr{}},
85 EndElement{Name{"ns2", "inner"}},
86 CharData([]byte("\n ")),
87 EndElement{Name{"ns2", "outer"}},
88 CharData([]byte("\n ")),
89 StartElement{Name{"ns3", "name"}, []Attr{}},
90 CharData([]byte("\n ")),
91 CharData([]byte("Some text here.")),
92 CharData([]byte("\n ")),
93 EndElement{Name{"ns3", "name"}},
94 CharData([]byte("\n")),
95 EndElement{Name{"ns2", "body"}},
96 Comment([]byte(" missing final newline ")),
99 const testInputAltEncoding = `
100 <?xml version="1.0" encoding="x-testing-uppercase"?>
103 var rawTokensAltEncoding = []Token{
104 CharData([]byte("\n")),
105 ProcInst{"xml", []byte(`version="1.0" encoding="x-testing-uppercase"`)},
106 CharData([]byte("\n")),
107 StartElement{Name{"", "tag"}, []Attr{}},
108 CharData([]byte("value")),
109 EndElement{Name{"", "tag"}},
112 var xmlInput = []string{
113 // unexpected EOF cases
138 // other Syntax errors
143 // "<!0 >", // let the Token() caller handle
152 // "<![CDATA[d]]>", // let the Token() caller handle
159 type stringReader struct {
164 func (r *stringReader) Read(b []byte) (n int, err error) {
165 if r.off >= len(r.s) {
168 for r.off < len(r.s) && n < len(b) {
176 func (r *stringReader) ReadByte() (b byte, err error) {
177 if r.off >= len(r.s) {
185 func StringReader(s string) io.Reader { return &stringReader{s, 0} }
187 func TestRawToken(t *testing.T) {
188 p := NewParser(StringReader(testInput))
189 testRawToken(t, p, rawTokens)
192 type downCaser struct {
197 func (d *downCaser) ReadByte() (c byte, err error) {
198 c, err = d.r.ReadByte()
199 if c >= 'A' && c <= 'Z' {
205 func (d *downCaser) Read(p []byte) (int, error) {
206 d.t.Fatalf("unexpected Read call on downCaser reader")
210 func TestRawTokenAltEncoding(t *testing.T) {
212 p := NewParser(StringReader(testInputAltEncoding))
213 p.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
214 sawEncoding = charset
215 if charset != "x-testing-uppercase" {
216 t.Fatalf("unexpected charset %q", charset)
218 return &downCaser{t, input.(io.ByteReader)}, nil
220 testRawToken(t, p, rawTokensAltEncoding)
223 func TestRawTokenAltEncodingNoConverter(t *testing.T) {
224 p := NewParser(StringReader(testInputAltEncoding))
225 token, err := p.RawToken()
227 t.Fatalf("expected a token on first RawToken call")
232 token, err = p.RawToken()
234 t.Errorf("expected a nil token; got %#v", token)
237 t.Fatalf("expected an error on second RawToken call")
239 const encoding = "x-testing-uppercase"
240 if !strings.Contains(err.Error(), encoding) {
241 t.Errorf("expected error to contain %q; got error: %v",
246 func testRawToken(t *testing.T, p *Parser, rawTokens []Token) {
247 for i, want := range rawTokens {
248 have, err := p.RawToken()
250 t.Fatalf("token %d: unexpected error: %s", i, err)
252 if !reflect.DeepEqual(have, want) {
253 t.Errorf("token %d = %#v want %#v", i, have, want)
258 // Ensure that directives (specifically !DOCTYPE) include the complete
259 // text of any nested directives, noting that < and > do not change
260 // nesting depth if they are in single or double quotes.
262 var nestedDirectivesInput = `
263 <!DOCTYPE [<!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns#">]>
264 <!DOCTYPE [<!ENTITY xlt ">">]>
265 <!DOCTYPE [<!ENTITY xlt "<">]>
266 <!DOCTYPE [<!ENTITY xlt '>'>]>
267 <!DOCTYPE [<!ENTITY xlt '<'>]>
268 <!DOCTYPE [<!ENTITY xlt '">'>]>
269 <!DOCTYPE [<!ENTITY xlt "'<">]>
272 var nestedDirectivesTokens = []Token{
273 CharData([]byte("\n")),
274 Directive([]byte(`DOCTYPE [<!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns#">]`)),
275 CharData([]byte("\n")),
276 Directive([]byte(`DOCTYPE [<!ENTITY xlt ">">]`)),
277 CharData([]byte("\n")),
278 Directive([]byte(`DOCTYPE [<!ENTITY xlt "<">]`)),
279 CharData([]byte("\n")),
280 Directive([]byte(`DOCTYPE [<!ENTITY xlt '>'>]`)),
281 CharData([]byte("\n")),
282 Directive([]byte(`DOCTYPE [<!ENTITY xlt '<'>]`)),
283 CharData([]byte("\n")),
284 Directive([]byte(`DOCTYPE [<!ENTITY xlt '">'>]`)),
285 CharData([]byte("\n")),
286 Directive([]byte(`DOCTYPE [<!ENTITY xlt "'<">]`)),
287 CharData([]byte("\n")),
290 func TestNestedDirectives(t *testing.T) {
291 p := NewParser(StringReader(nestedDirectivesInput))
293 for i, want := range nestedDirectivesTokens {
294 have, err := p.Token()
296 t.Fatalf("token %d: unexpected error: %s", i, err)
298 if !reflect.DeepEqual(have, want) {
299 t.Errorf("token %d = %#v want %#v", i, have, want)
304 func TestToken(t *testing.T) {
305 p := NewParser(StringReader(testInput))
307 for i, want := range cookedTokens {
308 have, err := p.Token()
310 t.Fatalf("token %d: unexpected error: %s", i, err)
312 if !reflect.DeepEqual(have, want) {
313 t.Errorf("token %d = %#v want %#v", i, have, want)
318 func TestSyntax(t *testing.T) {
319 for i := range xmlInput {
320 p := NewParser(StringReader(xmlInput[i]))
322 for _, err = p.Token(); err == nil; _, err = p.Token() {
324 if _, ok := err.(*SyntaxError); !ok {
325 t.Fatalf(`xmlInput "%s": expected SyntaxError not received`, xmlInput[i])
330 type allScalars struct {
352 var all = allScalars{
376 const testScalarsInput = `<allscalars>
379 <false1>false</false1>
391 <uintptr>11</uintptr>
393 <float32>13.0</float32>
394 <float64>14.0</float64>
396 <ptrstring>16</ptrstring>
399 func TestAllScalars(t *testing.T) {
401 buf := bytes.NewBufferString(testScalarsInput)
402 err := Unmarshal(buf, &a)
407 if !reflect.DeepEqual(a, all) {
408 t.Errorf("have %+v want %+v", a, all)
416 func TestIssue569(t *testing.T) {
417 data := `<item><field_a>abcd</field_a></item>`
419 buf := bytes.NewBufferString(data)
420 err := Unmarshal(buf, &i)
422 if err != nil || i.Field_a != "abcd" {
423 t.Fatal("Expecting abcd")
427 func TestUnquotedAttrs(t *testing.T) {
428 data := "<tag attr=azAZ09:-_\t>"
429 p := NewParser(StringReader(data))
431 token, err := p.Token()
432 if _, ok := err.(*SyntaxError); ok {
433 t.Errorf("Unexpected error: %v", err)
435 if token.(StartElement).Name.Local != "tag" {
436 t.Errorf("Unexpected tag name: %v", token.(StartElement).Name.Local)
438 attr := token.(StartElement).Attr[0]
439 if attr.Value != "azAZ09:-_" {
440 t.Errorf("Unexpected attribute value: %v", attr.Value)
442 if attr.Name.Local != "attr" {
443 t.Errorf("Unexpected attribute name: %v", attr.Name.Local)
447 func TestValuelessAttrs(t *testing.T) {
448 tests := [][3]string{
449 {"<p nowrap>", "p", "nowrap"},
450 {"<p nowrap >", "p", "nowrap"},
451 {"<input checked/>", "input", "checked"},
452 {"<input checked />", "input", "checked"},
454 for _, test := range tests {
455 p := NewParser(StringReader(test[0]))
457 token, err := p.Token()
458 if _, ok := err.(*SyntaxError); ok {
459 t.Errorf("Unexpected error: %v", err)
461 if token.(StartElement).Name.Local != test[1] {
462 t.Errorf("Unexpected tag name: %v", token.(StartElement).Name.Local)
464 attr := token.(StartElement).Attr[0]
465 if attr.Value != test[2] {
466 t.Errorf("Unexpected attribute value: %v", attr.Value)
468 if attr.Name.Local != test[2] {
469 t.Errorf("Unexpected attribute name: %v", attr.Name.Local)
474 func TestCopyTokenCharData(t *testing.T) {
475 data := []byte("same data")
476 var tok1 Token = CharData(data)
477 tok2 := CopyToken(tok1)
478 if !reflect.DeepEqual(tok1, tok2) {
479 t.Error("CopyToken(CharData) != CharData")
482 if reflect.DeepEqual(tok1, tok2) {
483 t.Error("CopyToken(CharData) uses same buffer.")
487 func TestCopyTokenStartElement(t *testing.T) {
488 elt := StartElement{Name{"", "hello"}, []Attr{{Name{"", "lang"}, "en"}}}
490 tok2 := CopyToken(tok1)
491 if !reflect.DeepEqual(tok1, tok2) {
492 t.Error("CopyToken(StartElement) != StartElement")
494 elt.Attr[0] = Attr{Name{"", "lang"}, "de"}
495 if reflect.DeepEqual(tok1, tok2) {
496 t.Error("CopyToken(CharData) uses same buffer.")
500 func TestSyntaxErrorLineNum(t *testing.T) {
501 testInput := "<P>Foo<P>\n\n<P>Bar</>\n"
502 p := NewParser(StringReader(testInput))
504 for _, err = p.Token(); err == nil; _, err = p.Token() {
506 synerr, ok := err.(*SyntaxError)
508 t.Error("Expected SyntaxError.")
510 if synerr.Line != 3 {
511 t.Error("SyntaxError didn't have correct line number.")
515 func TestTrailingRawToken(t *testing.T) {
516 input := `<FOO></FOO> `
517 p := NewParser(StringReader(input))
519 for _, err = p.RawToken(); err == nil; _, err = p.RawToken() {
522 t.Fatalf("p.RawToken() = _, %v, want _, io.EOF", err)
526 func TestTrailingToken(t *testing.T) {
527 input := `<FOO></FOO> `
528 p := NewParser(StringReader(input))
530 for _, err = p.Token(); err == nil; _, err = p.Token() {
533 t.Fatalf("p.Token() = _, %v, want _, io.EOF", err)
537 func TestEntityInsideCDATA(t *testing.T) {
538 input := `<test><![CDATA[ &val=foo ]]></test>`
539 p := NewParser(StringReader(input))
541 for _, err = p.Token(); err == nil; _, err = p.Token() {
544 t.Fatalf("p.Token() = _, %v, want _, io.EOF", err)
548 // The last three tests (one for characters in attribute names and
549 // two for character entities) pass, but not because of the code
550 // changed for issue 1259; the listed messages come from other parts
551 // of xml.Parser. They are included to record the current behavior
552 // in cases where one might expect character range checking to
553 // detect the error, although in fact it does not.
555 var characterTests = []struct {
559 {"\x12<doc/>", "illegal character code U+0012"},
560 {"<?xml version=\"1.0\"?>\x0b<doc/>", "illegal character code U+000B"},
561 {"\xef\xbf\xbe<doc/>", "illegal character code U+FFFE"},
562 {"<?xml version=\"1.0\"?><doc>\r\n<hiya/>\x07<toots/></doc>", "illegal character code U+0007"},
563 {"<?xml version=\"1.0\"?><doc \x12='value'>what's up</doc>", "expected attribute name in element"},
564 {"<doc>&\x01;</doc>", "invalid character entity &;"},
565 {"<doc>&\xef\xbf\xbe;</doc>", "invalid character entity &;"},
568 func TestDisallowedCharacters(t *testing.T) {
570 for i, tt := range characterTests {
571 p := NewParser(StringReader(tt.in))
577 synerr, ok := err.(*SyntaxError)
579 t.Fatalf("input %d p.Token() = _, %v, want _, *SyntaxError", i, err)
581 if synerr.Msg != tt.err {
582 t.Fatalf("input %d synerr.Msg wrong: want '%s', got '%s'", i, tt.err, synerr.Msg)
587 type procInstEncodingTest struct {
591 var procInstTests = []struct {
594 {`version="1.0" encoding="utf-8"`, "utf-8"},
595 {`version="1.0" encoding='utf-8'`, "utf-8"},
596 {`version="1.0" encoding='utf-8' `, "utf-8"},
597 {`version="1.0" encoding=utf-8`, ""},
598 {`encoding="FOO" `, "FOO"},
601 func TestProcInstEncoding(t *testing.T) {
602 for _, test := range procInstTests {
603 got := procInstEncoding(test.input)
604 if got != test.expect {
605 t.Errorf("procInstEncoding(%q) = %q; want %q", test.input, got, test.expect)