OSDN Git Service

Add Go frontend, libgo library, and Go testsuite.
[pf3gnuchains/gcc-fork.git] / libgo / go / xml / xml.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package xml implements a simple XML 1.0 parser that
6 // understands XML name spaces.
7 package xml
8
9 // References:
10 //    Annotated XML spec: http://www.xml.com/axml/testaxml.htm
11 //    XML name spaces: http://www.w3.org/TR/REC-xml-names/
12
13 // TODO(rsc):
14 //      Test error handling.
15
16 import (
17         "bufio"
18         "bytes"
19         "io"
20         "os"
21         "strconv"
22         "strings"
23         "unicode"
24         "utf8"
25 )
26
// A SyntaxError describes a syntax problem found in the XML input
// stream, together with the input line on which it was detected.
type SyntaxError struct {
	Msg  string // description of the problem
	Line int    // input line the parser was on when the error was created
}

// String renders the error as "XML syntax error on line N: msg".
func (e *SyntaxError) String() string {
	where := "XML syntax error on line " + strconv.Itoa(e.Line)
	return where + ": " + e.Msg
}
36
// A Name is an XML name split into its name space identifier (Space)
// and its local part (Local).
// In tokens returned by Parser.Token, Space holds the canonical URL
// of the name space, not the short prefix written in the document
// being parsed.
type Name struct {
	Space string
	Local string
}
45
// An Attr represents an attribute in an XML element (Name=Value).
type Attr struct {
	Name  Name   // attribute name, with optional name space part
	Value string // attribute value as stored by the parser
}
51
// A Token is an interface holding one of the token types:
// StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
// The interface itself has no methods; callers use a type switch or
// type assertion to find out which kind of token they received.
type Token interface{}
55
// A StartElement represents an XML start element.
type StartElement struct {
	Name Name   // element name
	Attr []Attr // attributes in the order they were parsed
}
61
62 func (e StartElement) Copy() StartElement {
63         attrs := make([]Attr, len(e.Attr))
64         copy(e.Attr, attrs)
65         e.Attr = attrs
66         return e
67 }
68
// An EndElement represents an XML end element.
type EndElement struct {
	Name Name // name of the element being closed
}
73
// A CharData represents XML character data (raw text),
// in which XML escape sequences have been replaced by
// the characters they represent.
// The Parser also returns the contents of a <![CDATA[ section
// as CharData.
type CharData []byte
78
// makeCopy returns a newly allocated slice holding the same bytes as b.
// The result is never nil, even when b is nil or empty.
func makeCopy(b []byte) []byte {
	n := len(b)
	dup := make([]byte, n)
	copy(dup, b[0:n])
	return dup
}
84
85 func (c CharData) Copy() CharData { return CharData(makeCopy(c)) }
86
87 // A Comment represents an XML comment of the form <!--comment-->.
88 // The bytes do not include the <!-- and --> comment markers.
89 type Comment []byte
90
91 func (c Comment) Copy() Comment { return Comment(makeCopy(c)) }
92
93 // A ProcInst represents an XML processing instruction of the form <?target inst?>
94 type ProcInst struct {
95         Target string
96         Inst   []byte
97 }
98
99 func (p ProcInst) Copy() ProcInst {
100         p.Inst = makeCopy(p.Inst)
101         return p
102 }
103
104 // A Directive represents an XML directive of the form <!text>.
105 // The bytes do not include the <! and > markers.
106 type Directive []byte
107
108 func (d Directive) Copy() Directive { return Directive(makeCopy(d)) }
109
110 // CopyToken returns a copy of a Token.
111 func CopyToken(t Token) Token {
112         switch v := t.(type) {
113         case CharData:
114                 return v.Copy()
115         case Comment:
116                 return v.Copy()
117         case Directive:
118                 return v.Copy()
119         case ProcInst:
120                 return v.Copy()
121         case StartElement:
122                 return v.Copy()
123         }
124         return t
125 }
126
// A Parser represents an XML parser reading a particular input stream.
// The parser assumes that its input is encoded in UTF-8.
type Parser struct {
	// Strict defaults to true, enforcing the requirements
	// of the XML specification.
	// If set to false, the parser allows input containing common
	// mistakes:
	//	* If an element is missing an end tag, the parser invents
	//	  end tags as necessary to keep the return values from Token
	//	  properly balanced.
	//	* In attribute values and character data, unknown or malformed
	//	  character entities (sequences beginning with &) are left alone.
	//
	// Setting:
	//
	//	p.Strict = false;
	//	p.AutoClose = HTMLAutoClose;
	//	p.Entity = HTMLEntity
	//
	// creates a parser that can handle typical HTML.
	Strict bool

	// When Strict == false, AutoClose indicates a set of elements to
	// consider closed immediately after they are opened, regardless
	// of whether an end element is present.
	// Names are matched without regard to letter case.
	AutoClose []string

	// Entity can be used to map non-standard entity names to string replacements.
	// The parser behaves as if these standard mappings are present in the map,
	// regardless of the actual map content:
	//
	//	"lt": "<",
	//	"gt": ">",
	//	"amp": "&",
	//	"apos": "'",
	//	"quot": `"`,
	Entity map[string]string

	r         io.ReadByter      // byte source, chosen by NewParser
	buf       bytes.Buffer      // scratch buffer reused for token data
	saved     *bytes.Buffer     // if non-nil, getc appends every byte read from r
	stk       *stack            // open elements and name space undo records
	free      *stack            // free list of stack records available for reuse
	needClose bool              // RawToken must return EndElement{toClose} next
	toClose   Name              // name for the pending EndElement
	nextToken Token             // token held back by Token after inventing an end tag
	nextByte  int               // byte pushed back by ungetc, or -1 if none
	ns        map[string]string // current name space prefix -> URL translations
	err       os.Error          // sticky error; once set, reads stop
	line      int               // current input line, starting at 1
	tmp       [32]byte          // scratch space for entity names in text
}
179
180 // NewParser creates a new XML parser reading from r.
181 func NewParser(r io.Reader) *Parser {
182         p := &Parser{
183                 ns:       make(map[string]string),
184                 nextByte: -1,
185                 line:     1,
186                 Strict:   true,
187         }
188
189         // Get efficient byte at a time reader.
190         // Assume that if reader has its own
191         // ReadByte, it's efficient enough.
192         // Otherwise, use bufio.
193         if rb, ok := r.(io.ReadByter); ok {
194                 p.r = rb
195         } else {
196                 p.r = bufio.NewReader(r)
197         }
198
199         return p
200 }
201
202 // Token returns the next XML token in the input stream.
203 // At the end of the input stream, Token returns nil, os.EOF.
204 //
205 // Slices of bytes in the returned token data refer to the
206 // parser's internal buffer and remain valid only until the next
207 // call to Token.  To acquire a copy of the bytes, call CopyToken
208 // or the token's Copy method.
209 //
210 // Token expands self-closing elements such as <br/>
211 // into separate start and end elements returned by successive calls.
212 //
213 // Token guarantees that the StartElement and EndElement
214 // tokens it returns are properly nested and matched:
215 // if Token encounters an unexpected end element,
216 // it will return an error.
217 //
218 // Token implements XML name spaces as described by
219 // http://www.w3.org/TR/REC-xml-names/.  Each of the
220 // Name structures contained in the Token has the Space
221 // set to the URL identifying its name space when known.
222 // If Token encounters an unrecognized name space prefix,
223 // it uses the prefix as the Space rather than report an error.
224 func (p *Parser) Token() (t Token, err os.Error) {
225         if p.nextToken != nil {
226                 t = p.nextToken
227                 p.nextToken = nil
228         } else if t, err = p.RawToken(); err != nil {
229                 return
230         }
231
232         if !p.Strict {
233                 if t1, ok := p.autoClose(t); ok {
234                         p.nextToken = t
235                         t = t1
236                 }
237         }
238         switch t1 := t.(type) {
239         case StartElement:
240                 // In XML name spaces, the translations listed in the
241                 // attributes apply to the element name and
242                 // to the other attribute names, so process
243                 // the translations first.
244                 for _, a := range t1.Attr {
245                         if a.Name.Space == "xmlns" {
246                                 v, ok := p.ns[a.Name.Local]
247                                 p.pushNs(a.Name.Local, v, ok)
248                                 p.ns[a.Name.Local] = a.Value
249                         }
250                         if a.Name.Space == "" && a.Name.Local == "xmlns" {
251                                 // Default space for untagged names
252                                 v, ok := p.ns[""]
253                                 p.pushNs("", v, ok)
254                                 p.ns[""] = a.Value
255                         }
256                 }
257
258                 p.translate(&t1.Name, true)
259                 for i := range t1.Attr {
260                         p.translate(&t1.Attr[i].Name, false)
261                 }
262                 p.pushElement(t1.Name)
263                 t = t1
264
265         case EndElement:
266                 p.translate(&t1.Name, true)
267                 if !p.popElement(&t1) {
268                         return nil, p.err
269                 }
270                 t = t1
271         }
272         return
273 }
274
275 // Apply name space translation to name n.
276 // The default name space (for Space=="")
277 // applies only to element names, not to attribute names.
278 func (p *Parser) translate(n *Name, isElementName bool) {
279         switch {
280         case n.Space == "xmlns":
281                 return
282         case n.Space == "" && !isElementName:
283                 return
284         case n.Space == "" && n.Local == "xmlns":
285                 return
286         }
287         if v, ok := p.ns[n.Space]; ok {
288                 n.Space = v
289         }
290 }
291
// Parsing state - stack holds old name space translations
// and the current set of open elements.  The translations to pop when
// ending a given tag are *below* it on the stack, which is
// more work but forced on us by XML.
type stack struct {
	next *stack // record below this one (or next free record on the free list)
	kind int    // stkStart or stkNs
	name Name   // stkStart: element name; stkNs: Local is the prefix, Space the old URL
	ok   bool   // stkNs only: whether the prefix had a translation before
}
302
// Kinds of records kept on the parser's stack.
const (
	stkStart = iota // an open element, pushed by pushElement
	stkNs           // a saved name space translation, pushed by pushNs
)
307
308 func (p *Parser) push(kind int) *stack {
309         s := p.free
310         if s != nil {
311                 p.free = s.next
312         } else {
313                 s = new(stack)
314         }
315         s.next = p.stk
316         s.kind = kind
317         p.stk = s
318         return s
319 }
320
321 func (p *Parser) pop() *stack {
322         s := p.stk
323         if s != nil {
324                 p.stk = s.next
325                 s.next = p.free
326                 p.free = s
327         }
328         return s
329 }
330
331 // Record that we are starting an element with the given name.
332 func (p *Parser) pushElement(name Name) {
333         s := p.push(stkStart)
334         s.name = name
335 }
336
337 // Record that we are changing the value of ns[local].
338 // The old value is url, ok.
339 func (p *Parser) pushNs(local string, url string, ok bool) {
340         s := p.push(stkNs)
341         s.name.Local = local
342         s.name.Space = url
343         s.ok = ok
344 }
345
346 // Creates a SyntaxError with the current line number.
347 func (p *Parser) syntaxError(msg string) os.Error {
348         return &SyntaxError{Msg: msg, Line: p.line}
349 }
350
351 // Record that we are ending an element with the given name.
352 // The name must match the record at the top of the stack,
353 // which must be a pushElement record.
354 // After popping the element, apply any undo records from
355 // the stack to restore the name translations that existed
356 // before we saw this element.
357 func (p *Parser) popElement(t *EndElement) bool {
358         s := p.pop()
359         name := t.Name
360         switch {
361         case s == nil || s.kind != stkStart:
362                 p.err = p.syntaxError("unexpected end element </" + name.Local + ">")
363                 return false
364         case s.name.Local != name.Local:
365                 if !p.Strict {
366                         p.needClose = true
367                         p.toClose = t.Name
368                         t.Name = s.name
369                         return true
370                 }
371                 p.err = p.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">")
372                 return false
373         case s.name.Space != name.Space:
374                 p.err = p.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
375                         "closed by </" + name.Local + "> in space " + name.Space)
376                 return false
377         }
378
379         // Pop stack until a Start is on the top, undoing the
380         // translations that were associated with the element we just closed.
381         for p.stk != nil && p.stk.kind != stkStart {
382                 s := p.pop()
383                 p.ns[s.name.Local] = s.name.Space, s.ok
384         }
385
386         return true
387 }
388
389 // If the top element on the stack is autoclosing and
390 // t is not the end tag, invent the end tag.
391 func (p *Parser) autoClose(t Token) (Token, bool) {
392         if p.stk == nil || p.stk.kind != stkStart {
393                 return nil, false
394         }
395         name := strings.ToLower(p.stk.name.Local)
396         for _, s := range p.AutoClose {
397                 if strings.ToLower(s) == name {
398                         // This one should be auto closed if t doesn't close it.
399                         et, ok := t.(EndElement)
400                         if !ok || et.Name.Local != name {
401                                 return EndElement{p.stk.name}, true
402                         }
403                         break
404                 }
405         }
406         return nil, false
407 }
408
409
// RawToken is like Token but does not verify that
// start and end elements match and does not translate
// name space prefixes to their corresponding URLs.
//
// Once the parser has recorded an error, every later call returns
// nil and that same error. Byte slices inside the returned token
// (CharData, Comment, Directive, ProcInst.Inst) refer to the parser's
// internal buffer, which is reset by later calls.
func (p *Parser) RawToken() (Token, os.Error) {
	// Errors are sticky.
	if p.err != nil {
		return nil, p.err
	}
	if p.needClose {
		// The last element we read was self-closing and
		// we returned just the StartElement half.
		// Return the EndElement half now.
		// (popElement also sets needClose, in non-strict mode, to
		// replay an end tag that did not match the open element.)
		p.needClose = false
		return EndElement{p.toClose}, nil
	}

	// A plain getc is used here so that end of input between tokens
	// is reported as whatever error the reader gave, not as a syntax error.
	b, ok := p.getc()
	if !ok {
		return nil, p.err
	}

	if b != '<' {
		// Text section.
		p.ungetc(b)
		data := p.text(-1, false)
		if data == nil {
			return nil, p.err
		}
		return CharData(data), nil
	}

	// From here on we are inside markup, so running out of input
	// is a syntax error (mustgetc).
	if b, ok = p.mustgetc(); !ok {
		return nil, p.err
	}
	switch b {
	case '/':
		// </: End element
		var name Name
		if name, ok = p.nsname(); !ok {
			if p.err == nil {
				p.err = p.syntaxError("expected element name after </")
			}
			return nil, p.err
		}
		p.space()
		if b, ok = p.mustgetc(); !ok {
			return nil, p.err
		}
		if b != '>' {
			p.err = p.syntaxError("invalid characters between </" + name.Local + " and >")
			return nil, p.err
		}
		return EndElement{name}, nil

	case '?':
		// <?: Processing instruction.
		// TODO(rsc): Should parse the <?xml declaration to make sure
		// the version is 1.0 and the encoding is UTF-8.
		var target string
		if target, ok = p.name(); !ok {
			if p.err == nil {
				p.err = p.syntaxError("expected target name after <?")
			}
			return nil, p.err
		}
		p.space()
		p.buf.Reset()
		// Accumulate bytes until the two-byte terminator ?> has been
		// written; b0 remembers the previous byte.
		var b0 byte
		for {
			if b, ok = p.mustgetc(); !ok {
				return nil, p.err
			}
			p.buf.WriteByte(b)
			if b0 == '?' && b == '>' {
				break
			}
			b0 = b
		}
		data := p.buf.Bytes()
		data = data[0 : len(data)-2] // chop ?>
		return ProcInst{target, data}, nil

	case '!':
		// <!: Maybe comment, maybe CDATA.
		if b, ok = p.mustgetc(); !ok {
			return nil, p.err
		}
		switch b {
		case '-': // <!-
			// Probably <!-- for a comment.
			if b, ok = p.mustgetc(); !ok {
				return nil, p.err
			}
			if b != '-' {
				p.err = p.syntaxError("invalid sequence <!- not part of <!--")
				return nil, p.err
			}
			// Look for terminator.
			// b0 and b1 hold the two bytes read before b.
			p.buf.Reset()
			var b0, b1 byte
			for {
				if b, ok = p.mustgetc(); !ok {
					return nil, p.err
				}
				p.buf.WriteByte(b)
				if b0 == '-' && b1 == '-' && b == '>' {
					break
				}
				b0, b1 = b1, b
			}
			data := p.buf.Bytes()
			data = data[0 : len(data)-3] // chop -->
			return Comment(data), nil

		case '[': // <![
			// Probably <![CDATA[.
			for i := 0; i < 6; i++ {
				if b, ok = p.mustgetc(); !ok {
					return nil, p.err
				}
				if b != "CDATA["[i] {
					p.err = p.syntaxError("invalid <![ sequence")
					return nil, p.err
				}
			}
			// Have <![CDATA[.  Read text until ]]>.
			data := p.text(-1, true)
			if data == nil {
				return nil, p.err
			}
			return CharData(data), nil
		}

		// Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
		// We don't care, but accumulate for caller.
		// The directive is taken to end at the first '>' seen; nested
		// angle brackets and quoted '>' characters get no special
		// treatment here.
		p.buf.Reset()
		p.buf.WriteByte(b)
		for {
			if b, ok = p.mustgetc(); !ok {
				return nil, p.err
			}
			if b == '>' {
				break
			}
			p.buf.WriteByte(b)
		}
		return Directive(p.buf.Bytes()), nil
	}

	// Must be an open element like <a href="foo">
	p.ungetc(b)

	var (
		name  Name
		empty bool // element ended with />
		attr  []Attr
	)
	if name, ok = p.nsname(); !ok {
		if p.err == nil {
			p.err = p.syntaxError("expected element name after <")
		}
		return nil, p.err
	}

	attr = make([]Attr, 0, 4)
	// Each iteration consumes either the end of the tag or one
	// name=value attribute.
	for {
		p.space()
		if b, ok = p.mustgetc(); !ok {
			return nil, p.err
		}
		if b == '/' {
			empty = true
			if b, ok = p.mustgetc(); !ok {
				return nil, p.err
			}
			if b != '>' {
				p.err = p.syntaxError("expected /> in element")
				return nil, p.err
			}
			break
		}
		if b == '>' {
			break
		}
		p.ungetc(b)

		// Make room for one more attribute, doubling the capacity
		// when the slice is full.
		n := len(attr)
		if n >= cap(attr) {
			nattr := make([]Attr, n, 2*cap(attr))
			copy(nattr, attr)
			attr = nattr
		}
		attr = attr[0 : n+1]
		a := &attr[n]
		if a.Name, ok = p.nsname(); !ok {
			if p.err == nil {
				p.err = p.syntaxError("expected attribute name in element")
			}
			return nil, p.err
		}
		p.space()
		if b, ok = p.mustgetc(); !ok {
			return nil, p.err
		}
		if b != '=' {
			p.err = p.syntaxError("attribute name without = in element")
			return nil, p.err
		}
		p.space()
		data := p.attrval()
		if data == nil {
			return nil, p.err
		}
		// Convert to a string now: data points into p.buf, which
		// the next attribute will overwrite.
		a.Value = string(data)
	}

	if empty {
		// Arrange for the next call to return the matching EndElement.
		p.needClose = true
		p.toClose = name
	}
	return StartElement{name, attr}, nil
}
631
632 func (p *Parser) attrval() []byte {
633         b, ok := p.mustgetc()
634         if !ok {
635                 return nil
636         }
637         // Handle quoted attribute values
638         if b == '"' || b == '\'' {
639                 return p.text(int(b), false)
640         }
641         // Handle unquoted attribute values for strict parsers
642         if p.Strict {
643                 p.err = p.syntaxError("unquoted or missing attribute value in element")
644                 return nil
645         }
646         // Handle unquoted attribute values for unstrict parsers
647         p.ungetc(b)
648         p.buf.Reset()
649         for {
650                 b, ok = p.mustgetc()
651                 if !ok {
652                         return nil
653                 }
654                 // http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.2
655                 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' ||
656                         '0' <= b && b <= '9' || b == '_' || b == ':' || b == '-' {
657                         p.buf.WriteByte(b)
658                 } else {
659                         p.ungetc(b)
660                         break
661                 }
662         }
663         return p.buf.Bytes()
664 }
665
666 // Skip spaces if any
667 func (p *Parser) space() {
668         for {
669                 b, ok := p.getc()
670                 if !ok {
671                         return
672                 }
673                 switch b {
674                 case ' ', '\r', '\n', '\t':
675                 default:
676                         p.ungetc(b)
677                         return
678                 }
679         }
680 }
681
682 // Read a single byte.
683 // If there is no byte to read, return ok==false
684 // and leave the error in p.err.
685 // Maintain line number.
686 func (p *Parser) getc() (b byte, ok bool) {
687         if p.err != nil {
688                 return 0, false
689         }
690         if p.nextByte >= 0 {
691                 b = byte(p.nextByte)
692                 p.nextByte = -1
693         } else {
694                 b, p.err = p.r.ReadByte()
695                 if p.err != nil {
696                         return 0, false
697                 }
698                 if p.saved != nil {
699                         p.saved.WriteByte(b)
700                 }
701         }
702         if b == '\n' {
703                 p.line++
704         }
705         return b, true
706 }
707
708 // Return saved offset.
709 // If we did ungetc (nextByte >= 0), have to back up one.
710 func (p *Parser) savedOffset() int {
711         n := p.saved.Len()
712         if p.nextByte >= 0 {
713                 n--
714         }
715         return n
716 }
717
718 // Must read a single byte.
719 // If there is no byte to read,
720 // set p.err to SyntaxError("unexpected EOF")
721 // and return ok==false
722 func (p *Parser) mustgetc() (b byte, ok bool) {
723         if b, ok = p.getc(); !ok {
724                 if p.err == os.EOF {
725                         p.err = p.syntaxError("unexpected EOF")
726                 }
727         }
728         return
729 }
730
731 // Unread a single byte.
732 func (p *Parser) ungetc(b byte) {
733         if b == '\n' {
734                 p.line--
735         }
736         p.nextByte = int(b)
737 }
738
// entity maps the five predefined XML entity names to the characters
// they stand for. text consults this table before the Parser's own
// Entity map.
var entity = map[string]int{
	"lt":   '<',
	"gt":   '>',
	"amp":  '&',
	"apos": '\'',
	"quot": '"',
}
746
747 // Read plain text section (XML calls it character data).
748 // If quote >= 0, we are in a quoted string and need to find the matching quote.
749 // If cdata == true, we are in a <![CDATA[ section and need to find ]]>.
750 // On failure return nil and leave the error in p.err.
751 func (p *Parser) text(quote int, cdata bool) []byte {
752         var b0, b1 byte
753         var trunc int
754         p.buf.Reset()
755 Input:
756         for {
757                 b, ok := p.getc()
758                 if !ok {
759                         if cdata {
760                                 if p.err == os.EOF {
761                                         p.err = p.syntaxError("unexpected EOF in CDATA section")
762                                 }
763                                 return nil
764                         }
765                         break Input
766                 }
767
768                 // <![CDATA[ section ends with ]]>.
769                 // It is an error for ]]> to appear in ordinary text.
770                 if b0 == ']' && b1 == ']' && b == '>' {
771                         if cdata {
772                                 trunc = 2
773                                 break Input
774                         }
775                         p.err = p.syntaxError("unescaped ]]> not in CDATA section")
776                         return nil
777                 }
778
779                 // Stop reading text if we see a <.
780                 if b == '<' && !cdata {
781                         if quote >= 0 {
782                                 p.err = p.syntaxError("unescaped < inside quoted string")
783                                 return nil
784                         }
785                         p.ungetc('<')
786                         break Input
787                 }
788                 if quote >= 0 && b == byte(quote) {
789                         break Input
790                 }
791                 if b == '&' && !cdata {
792                         // Read escaped character expression up to semicolon.
793                         // XML in all its glory allows a document to define and use
794                         // its own character names with <!ENTITY ...> directives.
795                         // Parsers are required to recognize lt, gt, amp, apos, and quot
796                         // even if they have not been declared.  That's all we allow.
797                         var i int
798                 CharLoop:
799                         for i = 0; i < len(p.tmp); i++ {
800                                 var ok bool
801                                 p.tmp[i], ok = p.getc()
802                                 if !ok {
803                                         if p.err == os.EOF {
804                                                 p.err = p.syntaxError("unexpected EOF")
805                                         }
806                                         return nil
807                                 }
808                                 c := p.tmp[i]
809                                 if c == ';' {
810                                         break
811                                 }
812                                 if 'a' <= c && c <= 'z' ||
813                                         'A' <= c && c <= 'Z' ||
814                                         '0' <= c && c <= '9' ||
815                                         c == '_' || c == '#' {
816                                         continue
817                                 }
818                                 p.ungetc(c)
819                                 break
820                         }
821                         s := string(p.tmp[0:i])
822                         if i >= len(p.tmp) {
823                                 if !p.Strict {
824                                         b0, b1 = 0, 0
825                                         p.buf.WriteByte('&')
826                                         p.buf.Write(p.tmp[0:i])
827                                         continue Input
828                                 }
829                                 p.err = p.syntaxError("character entity expression &" + s + "... too long")
830                                 return nil
831                         }
832                         var haveText bool
833                         var text string
834                         if i >= 2 && s[0] == '#' {
835                                 var n uint64
836                                 var err os.Error
837                                 if i >= 3 && s[1] == 'x' {
838                                         n, err = strconv.Btoui64(s[2:], 16)
839                                 } else {
840                                         n, err = strconv.Btoui64(s[1:], 10)
841                                 }
842                                 if err == nil && n <= unicode.MaxRune {
843                                         text = string(n)
844                                         haveText = true
845                                 }
846                         } else {
847                                 if r, ok := entity[s]; ok {
848                                         text = string(r)
849                                         haveText = true
850                                 } else if p.Entity != nil {
851                                         text, haveText = p.Entity[s]
852                                 }
853                         }
854                         if !haveText {
855                                 if !p.Strict {
856                                         b0, b1 = 0, 0
857                                         p.buf.WriteByte('&')
858                                         p.buf.Write(p.tmp[0:i])
859                                         continue Input
860                                 }
861                                 p.err = p.syntaxError("invalid character entity &" + s + ";")
862                                 return nil
863                         }
864                         p.buf.Write([]byte(text))
865                         b0, b1 = 0, 0
866                         continue Input
867                 }
868                 p.buf.WriteByte(b)
869                 b0, b1 = b1, b
870         }
871         data := p.buf.Bytes()
872         data = data[0 : len(data)-trunc]
873
874         // Must rewrite \r and \r\n into \n.
875         w := 0
876         for r := 0; r < len(data); r++ {
877                 b := data[r]
878                 if b == '\r' {
879                         if r+1 < len(data) && data[r+1] == '\n' {
880                                 continue
881                         }
882                         b = '\n'
883                 }
884                 data[w] = b
885                 w++
886         }
887         return data[0:w]
888 }
889
890 // Get name space name: name with a : stuck in the middle.
891 // The part before the : is the name space identifier.
892 func (p *Parser) nsname() (name Name, ok bool) {
893         s, ok := p.name()
894         if !ok {
895                 return
896         }
897         i := strings.Index(s, ":")
898         if i < 0 {
899                 name.Local = s
900         } else {
901                 name.Space = s[0:i]
902                 name.Local = s[i+1:]
903         }
904         return name, true
905 }
906
907 // Get name: /first(first|second)*/
908 // Do not set p.err if the name is missing (unless unexpected EOF is received):
909 // let the caller provide better context.
910 func (p *Parser) name() (s string, ok bool) {
911         var b byte
912         if b, ok = p.mustgetc(); !ok {
913                 return
914         }
915
916         // As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]*
917         if b < utf8.RuneSelf && !isNameByte(b) {
918                 p.ungetc(b)
919                 return "", false
920         }
921         p.buf.Reset()
922         p.buf.WriteByte(b)
923         for {
924                 if b, ok = p.mustgetc(); !ok {
925                         return
926                 }
927                 if b < utf8.RuneSelf && !isNameByte(b) {
928                         p.ungetc(b)
929                         break
930                 }
931                 p.buf.WriteByte(b)
932         }
933
934         // Then we check the characters.
935         s = p.buf.String()
936         for i, c := range s {
937                 if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) {
938                         p.err = p.syntaxError("invalid XML name: " + s)
939                         return "", false
940                 }
941         }
942         return s, true
943 }
944
// isNameByte reports whether c is an ASCII letter, an ASCII digit,
// or one of the punctuation bytes '_', ':', '.', '-'.
func isNameByte(c byte) bool {
	switch {
	case 'A' <= c && c <= 'Z':
		return true
	case 'a' <= c && c <= 'z':
		return true
	case '0' <= c && c <= '9':
		return true
	}
	switch c {
	case '_', ':', '.', '-':
		return true
	}
	return false
}
951
952 // These tables were generated by cut and paste from Appendix B of
953 // the XML spec at http://www.xml.com/axml/testaxml.htm
954 // and then reformatting.  First corresponds to (Letter | '_' | ':')
955 // and second corresponds to NameChar.
956
// first is the set of code points that Parser.name accepts as the
// leading character of an XML name, and also at any later position:
// (Letter | '_' | ':').
// Each entry is a unicode.Range of the form {lo, hi, stride}. Entries
// whose stride is written as hi-lo select only the two end points.
var first = []unicode.Range{
	{0x003A, 0x003A, 1},
	{0x0041, 0x005A, 1},
	{0x005F, 0x005F, 1},
	{0x0061, 0x007A, 1},
	{0x00C0, 0x00D6, 1},
	{0x00D8, 0x00F6, 1},
	{0x00F8, 0x00FF, 1},
	{0x0100, 0x0131, 1},
	{0x0134, 0x013E, 1},
	{0x0141, 0x0148, 1},
	{0x014A, 0x017E, 1},
	{0x0180, 0x01C3, 1},
	{0x01CD, 0x01F0, 1},
	{0x01F4, 0x01F5, 1},
	{0x01FA, 0x0217, 1},
	{0x0250, 0x02A8, 1},
	{0x02BB, 0x02C1, 1},
	{0x0386, 0x0386, 1},
	{0x0388, 0x038A, 1},
	{0x038C, 0x038C, 1},
	{0x038E, 0x03A1, 1},
	{0x03A3, 0x03CE, 1},
	{0x03D0, 0x03D6, 1},
	{0x03DA, 0x03E0, 2},
	{0x03E2, 0x03F3, 1},
	{0x0401, 0x040C, 1},
	{0x040E, 0x044F, 1},
	{0x0451, 0x045C, 1},
	{0x045E, 0x0481, 1},
	{0x0490, 0x04C4, 1},
	{0x04C7, 0x04C8, 1},
	{0x04CB, 0x04CC, 1},
	{0x04D0, 0x04EB, 1},
	{0x04EE, 0x04F5, 1},
	{0x04F8, 0x04F9, 1},
	{0x0531, 0x0556, 1},
	{0x0559, 0x0559, 1},
	{0x0561, 0x0586, 1},
	{0x05D0, 0x05EA, 1},
	{0x05F0, 0x05F2, 1},
	{0x0621, 0x063A, 1},
	{0x0641, 0x064A, 1},
	{0x0671, 0x06B7, 1},
	{0x06BA, 0x06BE, 1},
	{0x06C0, 0x06CE, 1},
	{0x06D0, 0x06D3, 1},
	{0x06D5, 0x06D5, 1},
	{0x06E5, 0x06E6, 1},
	{0x0905, 0x0939, 1},
	{0x093D, 0x093D, 1},
	{0x0958, 0x0961, 1},
	{0x0985, 0x098C, 1},
	{0x098F, 0x0990, 1},
	{0x0993, 0x09A8, 1},
	{0x09AA, 0x09B0, 1},
	{0x09B2, 0x09B2, 1},
	{0x09B6, 0x09B9, 1},
	{0x09DC, 0x09DD, 1},
	{0x09DF, 0x09E1, 1},
	{0x09F0, 0x09F1, 1},
	{0x0A05, 0x0A0A, 1},
	{0x0A0F, 0x0A10, 1},
	{0x0A13, 0x0A28, 1},
	{0x0A2A, 0x0A30, 1},
	{0x0A32, 0x0A33, 1},
	{0x0A35, 0x0A36, 1},
	{0x0A38, 0x0A39, 1},
	{0x0A59, 0x0A5C, 1},
	{0x0A5E, 0x0A5E, 1},
	{0x0A72, 0x0A74, 1},
	{0x0A85, 0x0A8B, 1},
	{0x0A8D, 0x0A8D, 1},
	{0x0A8F, 0x0A91, 1},
	{0x0A93, 0x0AA8, 1},
	{0x0AAA, 0x0AB0, 1},
	{0x0AB2, 0x0AB3, 1},
	{0x0AB5, 0x0AB9, 1},
	{0x0ABD, 0x0AE0, 0x23},
	{0x0B05, 0x0B0C, 1},
	{0x0B0F, 0x0B10, 1},
	{0x0B13, 0x0B28, 1},
	{0x0B2A, 0x0B30, 1},
	{0x0B32, 0x0B33, 1},
	{0x0B36, 0x0B39, 1},
	{0x0B3D, 0x0B3D, 1},
	{0x0B5C, 0x0B5D, 1},
	{0x0B5F, 0x0B61, 1},
	{0x0B85, 0x0B8A, 1},
	{0x0B8E, 0x0B90, 1},
	{0x0B92, 0x0B95, 1},
	{0x0B99, 0x0B9A, 1},
	{0x0B9C, 0x0B9C, 1},
	{0x0B9E, 0x0B9F, 1},
	{0x0BA3, 0x0BA4, 1},
	{0x0BA8, 0x0BAA, 1},
	{0x0BAE, 0x0BB5, 1},
	{0x0BB7, 0x0BB9, 1},
	{0x0C05, 0x0C0C, 1},
	{0x0C0E, 0x0C10, 1},
	{0x0C12, 0x0C28, 1},
	{0x0C2A, 0x0C33, 1},
	{0x0C35, 0x0C39, 1},
	{0x0C60, 0x0C61, 1},
	{0x0C85, 0x0C8C, 1},
	{0x0C8E, 0x0C90, 1},
	{0x0C92, 0x0CA8, 1},
	{0x0CAA, 0x0CB3, 1},
	{0x0CB5, 0x0CB9, 1},
	{0x0CDE, 0x0CDE, 1},
	{0x0CE0, 0x0CE1, 1},
	{0x0D05, 0x0D0C, 1},
	{0x0D0E, 0x0D10, 1},
	{0x0D12, 0x0D28, 1},
	{0x0D2A, 0x0D39, 1},
	{0x0D60, 0x0D61, 1},
	{0x0E01, 0x0E2E, 1},
	{0x0E30, 0x0E30, 1},
	{0x0E32, 0x0E33, 1},
	{0x0E40, 0x0E45, 1},
	{0x0E81, 0x0E82, 1},
	{0x0E84, 0x0E84, 1},
	{0x0E87, 0x0E88, 1},
	{0x0E8A, 0x0E8D, 3},
	{0x0E94, 0x0E97, 1},
	{0x0E99, 0x0E9F, 1},
	{0x0EA1, 0x0EA3, 1},
	{0x0EA5, 0x0EA7, 2},
	{0x0EAA, 0x0EAB, 1},
	{0x0EAD, 0x0EAE, 1},
	{0x0EB0, 0x0EB0, 1},
	{0x0EB2, 0x0EB3, 1},
	{0x0EBD, 0x0EBD, 1},
	{0x0EC0, 0x0EC4, 1},
	{0x0F40, 0x0F47, 1},
	{0x0F49, 0x0F69, 1},
	{0x10A0, 0x10C5, 1},
	{0x10D0, 0x10F6, 1},
	{0x1100, 0x1100, 1},
	{0x1102, 0x1103, 1},
	{0x1105, 0x1107, 1},
	{0x1109, 0x1109, 1},
	{0x110B, 0x110C, 1},
	{0x110E, 0x1112, 1},
	{0x113C, 0x1140, 2},
	{0x114C, 0x1150, 2},
	{0x1154, 0x1155, 1},
	{0x1159, 0x1159, 1},
	{0x115F, 0x1161, 1},
	{0x1163, 0x1169, 2},
	{0x116D, 0x116E, 1},
	{0x1172, 0x1173, 1},
	{0x1175, 0x119E, 0x119E - 0x1175},
	{0x11A8, 0x11AB, 0x11AB - 0x11A8},
	{0x11AE, 0x11AF, 1},
	{0x11B7, 0x11B8, 1},
	{0x11BA, 0x11BA, 1},
	{0x11BC, 0x11C2, 1},
	{0x11EB, 0x11F0, 0x11F0 - 0x11EB},
	{0x11F9, 0x11F9, 1},
	{0x1E00, 0x1E9B, 1},
	{0x1EA0, 0x1EF9, 1},
	{0x1F00, 0x1F15, 1},
	{0x1F18, 0x1F1D, 1},
	{0x1F20, 0x1F45, 1},
	{0x1F48, 0x1F4D, 1},
	{0x1F50, 0x1F57, 1},
	{0x1F59, 0x1F5B, 0x1F5B - 0x1F59},
	{0x1F5D, 0x1F5D, 1},
	{0x1F5F, 0x1F7D, 1},
	{0x1F80, 0x1FB4, 1},
	{0x1FB6, 0x1FBC, 1},
	{0x1FBE, 0x1FBE, 1},
	{0x1FC2, 0x1FC4, 1},
	{0x1FC6, 0x1FCC, 1},
	{0x1FD0, 0x1FD3, 1},
	{0x1FD6, 0x1FDB, 1},
	{0x1FE0, 0x1FEC, 1},
	{0x1FF2, 0x1FF4, 1},
	{0x1FF6, 0x1FFC, 1},
	{0x2126, 0x2126, 1},
	{0x212A, 0x212B, 1},
	{0x212E, 0x212E, 1},
	{0x2180, 0x2182, 1},
	{0x3007, 0x3007, 1},
	{0x3021, 0x3029, 1},
	{0x3041, 0x3094, 1},
	{0x30A1, 0x30FA, 1},
	{0x3105, 0x312C, 1},
	{0x4E00, 0x9FA5, 1},
	{0xAC00, 0xD7A3, 1},
}
1149
// second is the set of additional code points (NameChar) that
// Parser.name accepts at any position of an XML name except the first.
// Each entry is a unicode.Range of the form {lo, hi, stride}. Entries
// whose stride is written as hi-lo select only the two end points.
var second = []unicode.Range{
	{0x002D, 0x002E, 1},
	{0x0030, 0x0039, 1},
	{0x00B7, 0x00B7, 1},
	{0x02D0, 0x02D1, 1},
	{0x0300, 0x0345, 1},
	{0x0360, 0x0361, 1},
	{0x0387, 0x0387, 1},
	{0x0483, 0x0486, 1},
	{0x0591, 0x05A1, 1},
	{0x05A3, 0x05B9, 1},
	{0x05BB, 0x05BD, 1},
	{0x05BF, 0x05BF, 1},
	{0x05C1, 0x05C2, 1},
	{0x05C4, 0x0640, 0x0640 - 0x05C4},
	{0x064B, 0x0652, 1},
	{0x0660, 0x0669, 1},
	{0x0670, 0x0670, 1},
	{0x06D6, 0x06DC, 1},
	{0x06DD, 0x06DF, 1},
	{0x06E0, 0x06E4, 1},
	{0x06E7, 0x06E8, 1},
	{0x06EA, 0x06ED, 1},
	{0x06F0, 0x06F9, 1},
	{0x0901, 0x0903, 1},
	{0x093C, 0x093C, 1},
	{0x093E, 0x094C, 1},
	{0x094D, 0x094D, 1},
	{0x0951, 0x0954, 1},
	{0x0962, 0x0963, 1},
	{0x0966, 0x096F, 1},
	{0x0981, 0x0983, 1},
	{0x09BC, 0x09BC, 1},
	{0x09BE, 0x09BF, 1},
	{0x09C0, 0x09C4, 1},
	{0x09C7, 0x09C8, 1},
	{0x09CB, 0x09CD, 1},
	{0x09D7, 0x09D7, 1},
	{0x09E2, 0x09E3, 1},
	{0x09E6, 0x09EF, 1},
	{0x0A02, 0x0A3C, 0x3A},
	{0x0A3E, 0x0A3F, 1},
	{0x0A40, 0x0A42, 1},
	{0x0A47, 0x0A48, 1},
	{0x0A4B, 0x0A4D, 1},
	{0x0A66, 0x0A6F, 1},
	{0x0A70, 0x0A71, 1},
	{0x0A81, 0x0A83, 1},
	{0x0ABC, 0x0ABC, 1},
	{0x0ABE, 0x0AC5, 1},
	{0x0AC7, 0x0AC9, 1},
	{0x0ACB, 0x0ACD, 1},
	{0x0AE6, 0x0AEF, 1},
	{0x0B01, 0x0B03, 1},
	{0x0B3C, 0x0B3C, 1},
	{0x0B3E, 0x0B43, 1},
	{0x0B47, 0x0B48, 1},
	{0x0B4B, 0x0B4D, 1},
	{0x0B56, 0x0B57, 1},
	{0x0B66, 0x0B6F, 1},
	{0x0B82, 0x0B83, 1},
	{0x0BBE, 0x0BC2, 1},
	{0x0BC6, 0x0BC8, 1},
	{0x0BCA, 0x0BCD, 1},
	{0x0BD7, 0x0BD7, 1},
	{0x0BE7, 0x0BEF, 1},
	{0x0C01, 0x0C03, 1},
	{0x0C3E, 0x0C44, 1},
	{0x0C46, 0x0C48, 1},
	{0x0C4A, 0x0C4D, 1},
	{0x0C55, 0x0C56, 1},
	{0x0C66, 0x0C6F, 1},
	{0x0C82, 0x0C83, 1},
	{0x0CBE, 0x0CC4, 1},
	{0x0CC6, 0x0CC8, 1},
	{0x0CCA, 0x0CCD, 1},
	{0x0CD5, 0x0CD6, 1},
	{0x0CE6, 0x0CEF, 1},
	{0x0D02, 0x0D03, 1},
	{0x0D3E, 0x0D43, 1},
	{0x0D46, 0x0D48, 1},
	{0x0D4A, 0x0D4D, 1},
	{0x0D57, 0x0D57, 1},
	{0x0D66, 0x0D6F, 1},
	{0x0E31, 0x0E31, 1},
	{0x0E34, 0x0E3A, 1},
	{0x0E46, 0x0E46, 1},
	{0x0E47, 0x0E4E, 1},
	{0x0E50, 0x0E59, 1},
	{0x0EB1, 0x0EB1, 1},
	{0x0EB4, 0x0EB9, 1},
	{0x0EBB, 0x0EBC, 1},
	{0x0EC6, 0x0EC6, 1},
	{0x0EC8, 0x0ECD, 1},
	{0x0ED0, 0x0ED9, 1},
	{0x0F18, 0x0F19, 1},
	{0x0F20, 0x0F29, 1},
	{0x0F35, 0x0F39, 2},
	{0x0F3E, 0x0F3F, 1},
	{0x0F71, 0x0F84, 1},
	{0x0F86, 0x0F8B, 1},
	{0x0F90, 0x0F95, 1},
	{0x0F97, 0x0F97, 1},
	{0x0F99, 0x0FAD, 1},
	{0x0FB1, 0x0FB7, 1},
	{0x0FB9, 0x0FB9, 1},
	{0x20D0, 0x20DC, 1},
	{0x20E1, 0x3005, 0x3005 - 0x20E1},
	{0x302A, 0x302F, 1},
	{0x3031, 0x3035, 1},
	{0x3099, 0x309A, 1},
	{0x309D, 0x309E, 1},
	{0x30FC, 0x30FE, 1},
}
1264
// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters.
var HTMLEntity = htmlEntity

// htmlEntity maps an HTML entity name, written without the leading '&'
// and trailing ';', to the text it stands for. The table was generated
// from the HTML 4 entity list by the script quoted below.
var htmlEntity = map[string]string{
	/*
		hget http://www.w3.org/TR/html4/sgml/entities.html |
		ssam '
			,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
			,x v/^\&lt;!ENTITY/d
			,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/	"\1": "\\u\2",/g
		'
	*/
	"nbsp":     "\u00A0",
	"iexcl":    "\u00A1",
	"cent":     "\u00A2",
	"pound":    "\u00A3",
	"curren":   "\u00A4",
	"yen":      "\u00A5",
	"brvbar":   "\u00A6",
	"sect":     "\u00A7",
	"uml":      "\u00A8",
	"copy":     "\u00A9",
	"ordf":     "\u00AA",
	"laquo":    "\u00AB",
	"not":      "\u00AC",
	"shy":      "\u00AD",
	"reg":      "\u00AE",
	"macr":     "\u00AF",
	"deg":      "\u00B0",
	"plusmn":   "\u00B1",
	"sup2":     "\u00B2",
	"sup3":     "\u00B3",
	"acute":    "\u00B4",
	"micro":    "\u00B5",
	"para":     "\u00B6",
	"middot":   "\u00B7",
	"cedil":    "\u00B8",
	"sup1":     "\u00B9",
	"ordm":     "\u00BA",
	"raquo":    "\u00BB",
	"frac14":   "\u00BC",
	"frac12":   "\u00BD",
	"frac34":   "\u00BE",
	"iquest":   "\u00BF",
	"Agrave":   "\u00C0",
	"Aacute":   "\u00C1",
	"Acirc":    "\u00C2",
	"Atilde":   "\u00C3",
	"Auml":     "\u00C4",
	"Aring":    "\u00C5",
	"AElig":    "\u00C6",
	"Ccedil":   "\u00C7",
	"Egrave":   "\u00C8",
	"Eacute":   "\u00C9",
	"Ecirc":    "\u00CA",
	"Euml":     "\u00CB",
	"Igrave":   "\u00CC",
	"Iacute":   "\u00CD",
	"Icirc":    "\u00CE",
	"Iuml":     "\u00CF",
	"ETH":      "\u00D0",
	"Ntilde":   "\u00D1",
	"Ograve":   "\u00D2",
	"Oacute":   "\u00D3",
	"Ocirc":    "\u00D4",
	"Otilde":   "\u00D5",
	"Ouml":     "\u00D6",
	"times":    "\u00D7",
	"Oslash":   "\u00D8",
	"Ugrave":   "\u00D9",
	"Uacute":   "\u00DA",
	"Ucirc":    "\u00DB",
	"Uuml":     "\u00DC",
	"Yacute":   "\u00DD",
	"THORN":    "\u00DE",
	"szlig":    "\u00DF",
	"agrave":   "\u00E0",
	"aacute":   "\u00E1",
	"acirc":    "\u00E2",
	"atilde":   "\u00E3",
	"auml":     "\u00E4",
	"aring":    "\u00E5",
	"aelig":    "\u00E6",
	"ccedil":   "\u00E7",
	"egrave":   "\u00E8",
	"eacute":   "\u00E9",
	"ecirc":    "\u00EA",
	"euml":     "\u00EB",
	"igrave":   "\u00EC",
	"iacute":   "\u00ED",
	"icirc":    "\u00EE",
	"iuml":     "\u00EF",
	"eth":      "\u00F0",
	"ntilde":   "\u00F1",
	"ograve":   "\u00F2",
	"oacute":   "\u00F3",
	"ocirc":    "\u00F4",
	"otilde":   "\u00F5",
	"ouml":     "\u00F6",
	"divide":   "\u00F7",
	"oslash":   "\u00F8",
	"ugrave":   "\u00F9",
	"uacute":   "\u00FA",
	"ucirc":    "\u00FB",
	"uuml":     "\u00FC",
	"yacute":   "\u00FD",
	"thorn":    "\u00FE",
	"yuml":     "\u00FF",
	"fnof":     "\u0192",
	"Alpha":    "\u0391",
	"Beta":     "\u0392",
	"Gamma":    "\u0393",
	"Delta":    "\u0394",
	"Epsilon":  "\u0395",
	"Zeta":     "\u0396",
	"Eta":      "\u0397",
	"Theta":    "\u0398",
	"Iota":     "\u0399",
	"Kappa":    "\u039A",
	"Lambda":   "\u039B",
	"Mu":       "\u039C",
	"Nu":       "\u039D",
	"Xi":       "\u039E",
	"Omicron":  "\u039F",
	"Pi":       "\u03A0",
	"Rho":      "\u03A1",
	"Sigma":    "\u03A3",
	"Tau":      "\u03A4",
	"Upsilon":  "\u03A5",
	"Phi":      "\u03A6",
	"Chi":      "\u03A7",
	"Psi":      "\u03A8",
	"Omega":    "\u03A9",
	"alpha":    "\u03B1",
	"beta":     "\u03B2",
	"gamma":    "\u03B3",
	"delta":    "\u03B4",
	"epsilon":  "\u03B5",
	"zeta":     "\u03B6",
	"eta":      "\u03B7",
	"theta":    "\u03B8",
	"iota":     "\u03B9",
	"kappa":    "\u03BA",
	"lambda":   "\u03BB",
	"mu":       "\u03BC",
	"nu":       "\u03BD",
	"xi":       "\u03BE",
	"omicron":  "\u03BF",
	"pi":       "\u03C0",
	"rho":      "\u03C1",
	"sigmaf":   "\u03C2",
	"sigma":    "\u03C3",
	"tau":      "\u03C4",
	"upsilon":  "\u03C5",
	"phi":      "\u03C6",
	"chi":      "\u03C7",
	"psi":      "\u03C8",
	"omega":    "\u03C9",
	"thetasym": "\u03D1",
	"upsih":    "\u03D2",
	"piv":      "\u03D6",
	"bull":     "\u2022",
	"hellip":   "\u2026",
	"prime":    "\u2032",
	"Prime":    "\u2033",
	"oline":    "\u203E",
	"frasl":    "\u2044",
	"weierp":   "\u2118",
	"image":    "\u2111",
	"real":     "\u211C",
	"trade":    "\u2122",
	"alefsym":  "\u2135",
	"larr":     "\u2190",
	"uarr":     "\u2191",
	"rarr":     "\u2192",
	"darr":     "\u2193",
	"harr":     "\u2194",
	"crarr":    "\u21B5",
	"lArr":     "\u21D0",
	"uArr":     "\u21D1",
	"rArr":     "\u21D2",
	"dArr":     "\u21D3",
	"hArr":     "\u21D4",
	"forall":   "\u2200",
	"part":     "\u2202",
	"exist":    "\u2203",
	"empty":    "\u2205",
	"nabla":    "\u2207",
	"isin":     "\u2208",
	"notin":    "\u2209",
	"ni":       "\u220B",
	"prod":     "\u220F",
	"sum":      "\u2211",
	"minus":    "\u2212",
	"lowast":   "\u2217",
	"radic":    "\u221A",
	"prop":     "\u221D",
	"infin":    "\u221E",
	"ang":      "\u2220",
	"and":      "\u2227",
	"or":       "\u2228",
	"cap":      "\u2229",
	"cup":      "\u222A",
	"int":      "\u222B",
	"there4":   "\u2234",
	"sim":      "\u223C",
	"cong":     "\u2245",
	"asymp":    "\u2248",
	"ne":       "\u2260",
	"equiv":    "\u2261",
	"le":       "\u2264",
	"ge":       "\u2265",
	"sub":      "\u2282",
	"sup":      "\u2283",
	"nsub":     "\u2284",
	"sube":     "\u2286",
	"supe":     "\u2287",
	"oplus":    "\u2295",
	"otimes":   "\u2297",
	"perp":     "\u22A5",
	"sdot":     "\u22C5",
	"lceil":    "\u2308",
	"rceil":    "\u2309",
	"lfloor":   "\u230A",
	"rfloor":   "\u230B",
	"lang":     "\u2329",
	"rang":     "\u232A",
	"loz":      "\u25CA",
	"spades":   "\u2660",
	"clubs":    "\u2663",
	"hearts":   "\u2665",
	"diams":    "\u2666",
	"quot":     "\u0022",
	"amp":      "\u0026",
	"lt":       "\u003C",
	"gt":       "\u003E",
	"OElig":    "\u0152",
	"oelig":    "\u0153",
	"Scaron":   "\u0160",
	"scaron":   "\u0161",
	"Yuml":     "\u0178",
	"circ":     "\u02C6",
	"tilde":    "\u02DC",
	"ensp":     "\u2002",
	"emsp":     "\u2003",
	"thinsp":   "\u2009",
	"zwnj":     "\u200C",
	"zwj":      "\u200D",
	"lrm":      "\u200E",
	"rlm":      "\u200F",
	"ndash":    "\u2013",
	"mdash":    "\u2014",
	"lsquo":    "\u2018",
	"rsquo":    "\u2019",
	"sbquo":    "\u201A",
	"ldquo":    "\u201C",
	"rdquo":    "\u201D",
	"bdquo":    "\u201E",
	"dagger":   "\u2020",
	"Dagger":   "\u2021",
	"permil":   "\u2030",
	"lsaquo":   "\u2039",
	"rsaquo":   "\u203A",
	"euro":     "\u20AC",
}
1531
// HTMLAutoClose is the set of HTML elements that
// should be considered to close automatically.
var HTMLAutoClose = htmlAutoClose

// htmlAutoClose lists, in lower case, the elements that the HTML 4
// loose DTD declares as EMPTY with an omissible end tag. Every entry
// must be the bare element name: the DTD pads short names with spaces
// to align its columns, and a padded entry such as "col     " can
// never equal the element name "col".
var htmlAutoClose = []string{
	/*
		hget http://www.w3.org/TR/html4/loose.dtd |
		9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/	"\1",/p' | tr A-Z a-z
	*/
	"basefont",
	"br",
	"area",
	"link",
	"img",
	"param",
	"hr",
	"input",
	"col",
	"frame",
	"isindex",
	"base",
	"meta",
}
1555
1556 var (
1557         esc_quot = []byte("&#34;") // shorter than "&quot;"
1558         esc_apos = []byte("&#39;") // shorter than "&apos;"
1559         esc_amp  = []byte("&amp;")
1560         esc_lt   = []byte("&lt;")
1561         esc_gt   = []byte("&gt;")
1562 )
1563
1564 // Escape writes to w the properly escaped XML equivalent
1565 // of the plain text data s.
1566 func Escape(w io.Writer, s []byte) {
1567         var esc []byte
1568         last := 0
1569         for i, c := range s {
1570                 switch c {
1571                 case '"':
1572                         esc = esc_quot
1573                 case '\'':
1574                         esc = esc_apos
1575                 case '&':
1576                         esc = esc_amp
1577                 case '<':
1578                         esc = esc_lt
1579                 case '>':
1580                         esc = esc_gt
1581                 default:
1582                         continue
1583                 }
1584                 w.Write(s[last:i])
1585                 w.Write(esc)
1586                 last = i + 1
1587         }
1588         w.Write(s[last:])
1589 }