OSDN Git Service

Remove the types float and complex.
[pf3gnuchains/gcc-fork.git] / libgo / go / html / escape.go
index f30086f..2799f69 100644 (file)
@@ -10,16 +10,118 @@ import (
        "utf8"
 )
 
+// These replacements permit compatibility with old numeric entities that 
+// assumed Windows-1252 encoding.
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
+var replacementTable = [...]int{
+       '\u20AC', // First entry is what 0x80 should be replaced with.
+       '\u0081',
+       '\u201A',
+       '\u0192',
+       '\u201E',
+       '\u2026',
+       '\u2020',
+       '\u2021',
+       '\u02C6',
+       '\u2030',
+       '\u0160',
+       '\u2039',
+       '\u0152',
+       '\u008D',
+       '\u017D',
+       '\u008F',
+       '\u0090',
+       '\u2018',
+       '\u2019',
+       '\u201C',
+       '\u201D',
+       '\u2022',
+       '\u2013',
+       '\u2014',
+       '\u02DC',
+       '\u2122',
+       '\u0161',
+       '\u203A',
+       '\u0153',
+       '\u009D',
+       '\u017E',
+       '\u0178', // Last entry is 0x9F.
+       // 0x00->'\uFFFD' is handled programmatically. 
+       // 0x0D->'\u000D' is a no-op.
+}
+
 // unescapeEntity reads an entity like "<" from b[src:] and writes the
 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
-// Precondition: src[0] == '&' && dst <= src.
+// Precondition: b[src] == '&' && dst <= src.
 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
-       // TODO(nigeltao): Check that this entity substitution algorithm matches the spec:
        // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
-       // TODO(nigeltao): Handle things like "&#20013;" or "&#x4e2d;".
 
        // i starts at 1 because we already know that s[0] == '&'.
        i, s := 1, b[src:]
+
+       if len(s) <= 1 {
+               b[dst] = b[src]
+               return dst + 1, src + 1
+       }
+
+       if s[i] == '#' {
+               if len(s) <= 3 { // We need to have at least "&#.".
+                       b[dst] = b[src]
+                       return dst + 1, src + 1
+               }
+               i++
+               c := s[i]
+               hex := false
+               if c == 'x' || c == 'X' {
+                       hex = true
+                       i++
+               }
+
+               x := 0
+               for i < len(s) {
+                       c = s[i]
+                       i++
+                       if hex {
+                               if '0' <= c && c <= '9' {
+                                       x = 16*x + int(c) - '0'
+                                       continue
+                               } else if 'a' <= c && c <= 'f' {
+                                       x = 16*x + int(c) - 'a' + 10
+                                       continue
+                               } else if 'A' <= c && c <= 'F' {
+                                       x = 16*x + int(c) - 'A' + 10
+                                       continue
+                               }
+                       } else if '0' <= c && c <= '9' {
+                               x = 10*x + int(c) - '0'
+                               continue
+                       }
+                       if c != ';' {
+                               i--
+                       }
+                       break
+               }
+
+               if i <= 3 { // No characters matched.
+                       b[dst] = b[src]
+                       return dst + 1, src + 1
+               }
+
+               if 0x80 <= x && x <= 0x9F {
+                       // Replace characters from Windows-1252 with UTF-8 equivalents.
+                       x = replacementTable[x-0x80]
+               } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
+                       // Replace invalid characters with the replacement character.
+                       x = '\uFFFD'
+               }
+
+               return dst + utf8.EncodeRune(b[dst:], x), src + i
+       }
+
+       // Consume the maximum number of characters possible, with the
+       // consumed characters matching one of the named references.
+
+       // TODO(nigeltao): unescape("&notit;") should be "¬it;"
        for i < len(s) {
                c := s[i]
                i++
@@ -30,12 +132,17 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
                if c != ';' {
                        i--
                }
-               x := entity[string(s[1:i])]
-               if x != 0 {
-                       return dst + utf8.EncodeRune(x, b[dst:]), src + i
-               }
                break
        }
+
+       entityName := string(s[1:i])
+       if x := entity[entityName]; x != 0 {
+               return dst + utf8.EncodeRune(b[dst:], x), src + i
+       } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
+               dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
+               return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
+       }
+
        dst1, src1 = dst+i, src+i
        copy(b[dst:dst1], b[src:src1])
        return dst1, src1