libgo/go/exp/html/escape.go

   1 // Copyright 2010 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package html
   6
   7 import (
   8         "bytes"
   9         "strings"
  10         "unicode/utf8"
  11 )
  12
  13 // These replacements permit compatibility with old numeric entities that
  14 // assumed Windows-1252 encoding.
  15 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
  16 var replacementTable = [...]rune{
  17         '\u20AC', // First entry is what 0x80 should be replaced with.
  18         '\u0081',
  19         '\u201A',
  20         '\u0192',
  21         '\u201E',
  22         '\u2026',
  23         '\u2020',
  24         '\u2021',
  25         '\u02C6',
  26         '\u2030',
  27         '\u0160',
  28         '\u2039',
  29         '\u0152',
  30         '\u008D',
  31         '\u017D',
  32         '\u008F',
  33         '\u0090',
  34         '\u2018',
  35         '\u2019',
  36         '\u201C',
  37         '\u201D',
  38         '\u2022',
  39         '\u2013',
  40         '\u2014',
  41         '\u02DC',
  42         '\u2122',
  43         '\u0161',
  44         '\u203A',
  45         '\u0153',
  46         '\u009D',
  47         '\u017E',
  48         '\u0178', // Last entry is 0x9F.
  49         // 0x00->'\uFFFD' is handled programmatically.
  50         // 0x0D->'\u000D' is a no-op.
  51 }
  52
  53 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
  54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
  55 // Precondition: b[src] == '&' && dst <= src.
  56 // attribute should be true if parsing an attribute value.
  57 func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
  58         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
  59
  60         // i starts at 1 because we already know that s[0] == '&'.
  61         i, s := 1, b[src:]
  62
  63         if len(s) <= 1 {
  64                 b[dst] = b[src]
  65                 return dst + 1, src + 1
  66         }
  67
  68         if s[i] == '#' {
  69                 if len(s) <= 3 { // We need to have at least "&#.".
  70                         b[dst] = b[src]
  71                         return dst + 1, src + 1
  72                 }
  73                 i++
  74                 c := s[i]
  75                 hex := false
  76                 if c == 'x' || c == 'X' {
  77                         hex = true
  78                         i++
  79                 }
  80
  81                 x := '\x00'
  82                 for i < len(s) {
  83                         c = s[i]
  84                         i++
  85                         if hex {
  86                                 if '0' <= c && c <= '9' {
  87                                         x = 16*x + rune(c) - '0'
  88                                         continue
  89                                 } else if 'a' <= c && c <= 'f' {
  90                                         x = 16*x + rune(c) - 'a' + 10
  91                                         continue
  92                                 } else if 'A' <= c && c <= 'F' {
  93                                         x = 16*x + rune(c) - 'A' + 10
  94                                         continue
  95                                 }
  96                         } else if '0' <= c && c <= '9' {
  97                                 x = 10*x + rune(c) - '0'
  98                                 continue
  99                         }
 100                         if c != ';' {
 101                                 i--
 102                         }
 103                         break
 104                 }
 105
 106                 if i <= 3 { // No characters matched.
 107                         b[dst] = b[src]
 108                         return dst + 1, src + 1
 109                 }
 110
 111                 if 0x80 <= x && x <= 0x9F {
 112                         // Replace characters from Windows-1252 with UTF-8 equivalents.
 113                         x = replacementTable[x-0x80]
 114                 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
 115                         // Replace invalid characters with the replacement character.
 116                         x = '\uFFFD'
 117                 }
 118
 119                 return dst + utf8.EncodeRune(b[dst:], x), src + i
 120         }
 121
 122         // Consume the maximum number of characters possible, with the
 123         // consumed characters matching one of the named references.
 124
 125         for i < len(s) {
 126                 c := s[i]
 127                 i++
 128                 // Lower-cased characters are more common in entities, so we check for them first.
 129                 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
 130                         continue
 131                 }
 132                 if c != ';' {
 133                         i--
 134                 }
 135                 break
 136         }
 137
 138         entityName := string(s[1:i])
 139         if entityName == "" {
 140                 // No-op.
 141         } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
 142                 // No-op.
 143         } else if x := entity[entityName]; x != 0 {
 144                 return dst + utf8.EncodeRune(b[dst:], x), src + i
 145         } else if x := entity2[entityName]; x[0] != 0 {
 146                 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
 147                 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
 148         } else if !attribute {
 149                 maxLen := len(entityName) - 1
 150                 if maxLen > longestEntityWithoutSemicolon {
 151                         maxLen = longestEntityWithoutSemicolon
 152                 }
 153                 for j := maxLen; j > 1; j-- {
 154                         if x := entity[entityName[:j]]; x != 0 {
 155                                 return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
 156                         }
 157                 }
 158         }
 159
 160         dst1, src1 = dst+i, src+i
 161         copy(b[dst:dst1], b[src:src1])
 162         return dst1, src1
 163 }
 164
 165 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
 166 func unescape(b []byte) []byte {
 167         for i, c := range b {
 168                 if c == '&' {
 169                         dst, src := unescapeEntity(b, i, i, false)
 170                         for src < len(b) {
 171                                 c := b[src]
 172                                 if c == '&' {
 173                                         dst, src = unescapeEntity(b, dst, src, false)
 174                                 } else {
 175                                         b[dst] = c
 176                                         dst, src = dst+1, src+1
 177                                 }
 178                         }
 179                         return b[0:dst]
 180                 }
 181         }
 182         return b
 183 }
 184
 185 // lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
 186 func lower(b []byte) []byte {
 187         for i, c := range b {
 188                 if 'A' <= c && c <= 'Z' {
 189                         b[i] = c + 'a' - 'A'
 190                 }
 191         }
 192         return b
 193 }
 194
 195 const escapedChars = `&'<>"`
 196
 197 func escape(w writer, s string) error {
 198         i := strings.IndexAny(s, escapedChars)
 199         for i != -1 {
 200                 if _, err := w.WriteString(s[:i]); err != nil {
 201                         return err
 202                 }
 203                 var esc string
 204                 switch s[i] {
 205                 case '&':
 206                         esc = "&amp;"
 207                 case '\'':
 208                         esc = "&apos;"
 209                 case '<':
 210                         esc = "&lt;"
 211                 case '>':
 212                         esc = "&gt;"
 213                 case '"':
 214                         esc = "&quot;"
 215                 default:
 216                         panic("unrecognized escape character")
 217                 }
 218                 s = s[i+1:]
 219                 if _, err := w.WriteString(esc); err != nil {
 220                         return err
 221                 }
 222                 i = strings.IndexAny(s, escapedChars)
 223         }
 224         _, err := w.WriteString(s)
 225         return err
 226 }
 227
 228 // EscapeString escapes special characters like "<" to become "&lt;". It
 229 // escapes only five such characters: amp, apos, lt, gt and quot.
 230 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 231 // always true.
 232 func EscapeString(s string) string {
 233         if strings.IndexAny(s, escapedChars) == -1 {
 234                 return s
 235         }
 236         var buf bytes.Buffer
 237         escape(&buf, s)
 238         return buf.String()
 239 }
 240
 241 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
 242 // larger range of entities than EscapeString escapes. For example, "&aacute;"
 243 // unescapes to "á", as does "&#225;" and "&xE1;".
 244 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 245 // always true.
 246 func UnescapeString(s string) string {
 247         for _, c := range s {
 248                 if c == '&' {
 249                         return string(unescape([]byte(s)))
 250                 }
 251         }
 252         return s
 253 }