OSDN Git Service

Remove the types float and complex.
[pf3gnuchains/gcc-fork.git] / libgo / go / html / escape.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package html
6
7 import (
8         "bytes"
9         "strings"
10         "utf8"
11 )
12
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming the Windows-1252 encoding, to the code
// points they should be replaced with. It is indexed by (reference - 0x80).
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two related cases are not in the table:
// 0x00->'\uFFFD' is handled programmatically.
// 0x0D->'\u000D' is a no-op.
var replacementTable = [...]int{
	0x00: '\u20AC', // 0x80 -> €. First entry is what 0x80 should be replaced with.
	0x01: '\u0081', // 0x81 maps to itself.
	0x02: '\u201A', // 0x82 -> ‚
	0x03: '\u0192', // 0x83 -> ƒ
	0x04: '\u201E', // 0x84 -> „
	0x05: '\u2026', // 0x85 -> …
	0x06: '\u2020', // 0x86 -> †
	0x07: '\u2021', // 0x87 -> ‡
	0x08: '\u02C6', // 0x88 -> ˆ
	0x09: '\u2030', // 0x89 -> ‰
	0x0A: '\u0160', // 0x8A -> Š
	0x0B: '\u2039', // 0x8B -> ‹
	0x0C: '\u0152', // 0x8C -> Œ
	0x0D: '\u008D', // 0x8D maps to itself.
	0x0E: '\u017D', // 0x8E -> Ž
	0x0F: '\u008F', // 0x8F maps to itself.
	0x10: '\u0090', // 0x90 maps to itself.
	0x11: '\u2018', // 0x91 -> ‘
	0x12: '\u2019', // 0x92 -> ’
	0x13: '\u201C', // 0x93 -> “
	0x14: '\u201D', // 0x94 -> ”
	0x15: '\u2022', // 0x95 -> •
	0x16: '\u2013', // 0x96 -> –
	0x17: '\u2014', // 0x97 -> —
	0x18: '\u02DC', // 0x98 -> ˜
	0x19: '\u2122', // 0x99 -> ™
	0x1A: '\u0161', // 0x9A -> š
	0x1B: '\u203A', // 0x9B -> ›
	0x1C: '\u0153', // 0x9C -> œ
	0x1D: '\u009D', // 0x9D maps to itself.
	0x1E: '\u017E', // 0x9E -> ž
	0x1F: '\u0178', // 0x9F -> Ÿ. Last entry is 0x9F.
}
52
53 // unescapeEntity reads an entity like "<" from b[src:] and writes the
54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
55 // Precondition: b[src] == '&' && dst <= src.
56 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
57         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
58
59         // i starts at 1 because we already know that s[0] == '&'.
60         i, s := 1, b[src:]
61
62         if len(s) <= 1 {
63                 b[dst] = b[src]
64                 return dst + 1, src + 1
65         }
66
67         if s[i] == '#' {
68                 if len(s) <= 3 { // We need to have at least "&#.".
69                         b[dst] = b[src]
70                         return dst + 1, src + 1
71                 }
72                 i++
73                 c := s[i]
74                 hex := false
75                 if c == 'x' || c == 'X' {
76                         hex = true
77                         i++
78                 }
79
80                 x := 0
81                 for i < len(s) {
82                         c = s[i]
83                         i++
84                         if hex {
85                                 if '0' <= c && c <= '9' {
86                                         x = 16*x + int(c) - '0'
87                                         continue
88                                 } else if 'a' <= c && c <= 'f' {
89                                         x = 16*x + int(c) - 'a' + 10
90                                         continue
91                                 } else if 'A' <= c && c <= 'F' {
92                                         x = 16*x + int(c) - 'A' + 10
93                                         continue
94                                 }
95                         } else if '0' <= c && c <= '9' {
96                                 x = 10*x + int(c) - '0'
97                                 continue
98                         }
99                         if c != ';' {
100                                 i--
101                         }
102                         break
103                 }
104
105                 if i <= 3 { // No characters matched.
106                         b[dst] = b[src]
107                         return dst + 1, src + 1
108                 }
109
110                 if 0x80 <= x && x <= 0x9F {
111                         // Replace characters from Windows-1252 with UTF-8 equivalents.
112                         x = replacementTable[x-0x80]
113                 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
114                         // Replace invalid characters with the replacement character.
115                         x = '\uFFFD'
116                 }
117
118                 return dst + utf8.EncodeRune(b[dst:], x), src + i
119         }
120
121         // Consume the maximum number of characters possible, with the
122         // consumed characters matching one of the named references.
123
124         // TODO(nigeltao): unescape("&notit;") should be "¬it;"
125         for i < len(s) {
126                 c := s[i]
127                 i++
128                 // Lower-cased characters are more common in entities, so we check for them first.
129                 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
130                         continue
131                 }
132                 if c != ';' {
133                         i--
134                 }
135                 break
136         }
137
138         entityName := string(s[1:i])
139         if x := entity[entityName]; x != 0 {
140                 return dst + utf8.EncodeRune(b[dst:], x), src + i
141         } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
142                 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
143                 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
144         }
145
146         dst1, src1 = dst+i, src+i
147         copy(b[dst:dst1], b[src:src1])
148         return dst1, src1
149 }
150
151 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
152 func unescape(b []byte) []byte {
153         for i, c := range b {
154                 if c == '&' {
155                         dst, src := unescapeEntity(b, i, i)
156                         for src < len(b) {
157                                 c := b[src]
158                                 if c == '&' {
159                                         dst, src = unescapeEntity(b, dst, src)
160                                 } else {
161                                         b[dst] = c
162                                         dst, src = dst+1, src+1
163                                 }
164                         }
165                         return b[0:dst]
166                 }
167         }
168         return b
169 }
170
// escapedChars lists the characters that escape replaces with entities.
const escapedChars = `&'<>"`

// escape appends s to buf, replacing each character listed in escapedChars
// with its named entity and copying everything else verbatim.
func escape(buf *bytes.Buffer, s string) {
	for {
		i := strings.IndexAny(s, escapedChars)
		if i < 0 {
			break
		}
		// Flush the run of ordinary text, then the entity for s[i].
		buf.WriteString(s[:i])
		switch s[i] {
		case '&':
			buf.WriteString("&amp;")
		case '\'':
			buf.WriteString("&apos;")
		case '<':
			buf.WriteString("&lt;")
		case '>':
			buf.WriteString("&gt;")
		case '"':
			buf.WriteString("&quot;")
		default:
			panic("unrecognized escape character")
		}
		s = s[i+1:]
	}
	// Whatever remains contains nothing that needs escaping.
	buf.WriteString(s)
}
198
199 // EscapeString escapes special characters like "<" to become "&lt;". It
200 // escapes only five such characters: amp, apos, lt, gt and quot.
201 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
202 // always true.
203 func EscapeString(s string) string {
204         if strings.IndexAny(s, escapedChars) == -1 {
205                 return s
206         }
207         buf := bytes.NewBuffer(nil)
208         escape(buf, s)
209         return buf.String()
210 }
211
212 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
213 // larger range of entities than EscapeString escapes. For example, "&aacute;"
214 // unescapes to "á", as does "&#225;" and "&xE1;".
215 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
216 // always true.
217 func UnescapeString(s string) string {
218         for _, c := range s {
219                 if c == '&' {
220                         return string(unescape([]byte(s)))
221                 }
222         }
223         return s
224 }