libgo/go/html/escape.go

   1 // Copyright 2010 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package html
   6
   7 import (
   8         "bytes"
   9         "strings"
  10         "utf8"
  11 )
  12
  13 // These replacements permit compatibility with old numeric entities that
  14 // assumed Windows-1252 encoding.
  15 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
  16 var replacementTable = [...]int{
  17         '\u20AC', // First entry is what 0x80 should be replaced with.
  18         '\u0081',
  19         '\u201A',
  20         '\u0192',
  21         '\u201E',
  22         '\u2026',
  23         '\u2020',
  24         '\u2021',
  25         '\u02C6',
  26         '\u2030',
  27         '\u0160',
  28         '\u2039',
  29         '\u0152',
  30         '\u008D',
  31         '\u017D',
  32         '\u008F',
  33         '\u0090',
  34         '\u2018',
  35         '\u2019',
  36         '\u201C',
  37         '\u201D',
  38         '\u2022',
  39         '\u2013',
  40         '\u2014',
  41         '\u02DC',
  42         '\u2122',
  43         '\u0161',
  44         '\u203A',
  45         '\u0153',
  46         '\u009D',
  47         '\u017E',
  48         '\u0178', // Last entry is 0x9F.
  49         // 0x00->'\uFFFD' is handled programmatically.
  50         // 0x0D->'\u000D' is a no-op.
  51 }
  52
  53 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
  54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
  55 // Precondition: b[src] == '&' && dst <= src.
  56 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
  57         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
  58
  59         // i starts at 1 because we already know that s[0] == '&'.
  60         i, s := 1, b[src:]
  61
  62         if len(s) <= 1 {
  63                 b[dst] = b[src]
  64                 return dst + 1, src + 1
  65         }
  66
  67         if s[i] == '#' {
  68                 if len(s) <= 3 { // We need to have at least "&#.".
  69                         b[dst] = b[src]
  70                         return dst + 1, src + 1
  71                 }
  72                 i++
  73                 c := s[i]
  74                 hex := false
  75                 if c == 'x' || c == 'X' {
  76                         hex = true
  77                         i++
  78                 }
  79
  80                 x := 0
  81                 for i < len(s) {
  82                         c = s[i]
  83                         i++
  84                         if hex {
  85                                 if '0' <= c && c <= '9' {
  86                                         x = 16*x + int(c) - '0'
  87                                         continue
  88                                 } else if 'a' <= c && c <= 'f' {
  89                                         x = 16*x + int(c) - 'a' + 10
  90                                         continue
  91                                 } else if 'A' <= c && c <= 'F' {
  92                                         x = 16*x + int(c) - 'A' + 10
  93                                         continue
  94                                 }
  95                         } else if '0' <= c && c <= '9' {
  96                                 x = 10*x + int(c) - '0'
  97                                 continue
  98                         }
  99                         if c != ';' {
 100                                 i--
 101                         }
 102                         break
 103                 }
 104
 105                 if i <= 3 { // No characters matched.
 106                         b[dst] = b[src]
 107                         return dst + 1, src + 1
 108                 }
 109
 110                 if 0x80 <= x && x <= 0x9F {
 111                         // Replace characters from Windows-1252 with UTF-8 equivalents.
 112                         x = replacementTable[x-0x80]
 113                 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
 114                         // Replace invalid characters with the replacement character.
 115                         x = '\uFFFD'
 116                 }
 117
 118                 return dst + utf8.EncodeRune(b[dst:], x), src + i
 119         }
 120
 121         // Consume the maximum number of characters possible, with the
 122         // consumed characters matching one of the named references.
 123
 124         // TODO(nigeltao): unescape("&notit;") should be "¬it;"
 125         for i < len(s) {
 126                 c := s[i]
 127                 i++
 128                 // Lower-cased characters are more common in entities, so we check for them first.
 129                 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
 130                         continue
 131                 }
 132                 if c != ';' {
 133                         i--
 134                 }
 135                 break
 136         }
 137
 138         entityName := string(s[1:i])
 139         if x := entity[entityName]; x != 0 {
 140                 return dst + utf8.EncodeRune(b[dst:], x), src + i
 141         } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
 142                 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
 143                 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
 144         }
 145
 146         dst1, src1 = dst+i, src+i
 147         copy(b[dst:dst1], b[src:src1])
 148         return dst1, src1
 149 }
 150
 151 // unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
 152 func unescape(b []byte) []byte {
 153         for i, c := range b {
 154                 if c == '&' {
 155                         dst, src := unescapeEntity(b, i, i)
 156                         for src < len(b) {
 157                                 c := b[src]
 158                                 if c == '&' {
 159                                         dst, src = unescapeEntity(b, dst, src)
 160                                 } else {
 161                                         b[dst] = c
 162                                         dst, src = dst+1, src+1
 163                                 }
 164                         }
 165                         return b[0:dst]
 166                 }
 167         }
 168         return b
 169 }
 170
 171 const escapedChars = `&'<>"`
 172
 173 func escape(buf *bytes.Buffer, s string) {
 174         i := strings.IndexAny(s, escapedChars)
 175         for i != -1 {
 176                 buf.WriteString(s[0:i])
 177                 var esc string
 178                 switch s[i] {
 179                 case '&':
 180                         esc = "&amp;"
 181                 case '\'':
 182                         esc = "&apos;"
 183                 case '<':
 184                         esc = "&lt;"
 185                 case '>':
 186                         esc = "&gt;"
 187                 case '"':
 188                         esc = "&quot;"
 189                 default:
 190                         panic("unrecognized escape character")
 191                 }
 192                 s = s[i+1:]
 193                 buf.WriteString(esc)
 194                 i = strings.IndexAny(s, escapedChars)
 195         }
 196         buf.WriteString(s)
 197 }
 198
 199 // EscapeString escapes special characters like "<" to become "&lt;". It
 200 // escapes only five such characters: amp, apos, lt, gt and quot.
 201 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 202 // always true.
 203 func EscapeString(s string) string {
 204         if strings.IndexAny(s, escapedChars) == -1 {
 205                 return s
 206         }
 207         buf := bytes.NewBuffer(nil)
 208         escape(buf, s)
 209         return buf.String()
 210 }
 211
 212 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
 213 // larger range of entities than EscapeString escapes. For example, "&aacute;"
 214 // unescapes to "á", as does "&#225;" and "&xE1;".
 215 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
 216 // always true.
 217 func UnescapeString(s string) string {
 218         for _, c := range s {
 219                 if c == '&' {
 220                         return string(unescape([]byte(s)))
 221                 }
 222         }
 223         return s
 224 }