libgo/go/exp/template/html/transition.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package html
   6
   7 import (
   8         "bytes"
   9         "strings"
  10 )
  11
// transitionFunc is the array of context transition functions for text nodes.
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
//
// The array is indexed by state: transitionFunc[st] is the function to apply
// while the context is in state st. Several states share one function, which
// then inspects c.state itself (for example tJSDelimited, tCSSStr, tBlockCmt
// and tLineCmt).
var transitionFunc = [...]func(context, []byte) (context, int){
	stateText:        tText,
	stateTag:         tTag,
	stateAttrName:    tAttrName,
	stateAfterName:   tAfterName,
	stateBeforeValue: tBeforeValue,
	stateHTMLCmt:     tHTMLCmt,
	stateRCDATA:      tSpecialTagEnd,
	stateAttr:        tAttr,
	stateURL:         tURL,
	stateJS:          tJS,
	stateJSDqStr:     tJSDelimited,
	stateJSSqStr:     tJSDelimited,
	stateJSRegexp:    tJSDelimited,
	stateJSBlockCmt:  tBlockCmt,
	stateJSLineCmt:   tLineCmt,
	stateCSS:         tCSS,
	stateCSSDqStr:    tCSSStr,
	stateCSSSqStr:    tCSSStr,
	stateCSSDqURL:    tCSSStr,
	stateCSSSqURL:    tCSSStr,
	stateCSSURL:      tCSSStr,
	stateCSSBlockCmt: tBlockCmt,
	stateCSSLineCmt:  tLineCmt,
	stateError:       tError,
}
  42
// commentStart is the byte sequence that opens an HTML comment. tText checks
// for it at each '<' it finds.
var commentStart = []byte("<!--")

// commentEnd is the byte sequence that closes an HTML comment. tHTMLCmt
// searches for it.
var commentEnd = []byte("-->")
  45
  46 // tText is the context transition function for the text state.
  47 func tText(c context, s []byte) (context, int) {
  48         k := 0
  49         for {
  50                 i := k + bytes.IndexByte(s[k:], '<')
  51                 if i < k || i+1 == len(s) {
  52                         return c, len(s)
  53                 } else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
  54                         return context{state: stateHTMLCmt}, i + 4
  55                 }
  56                 i++
  57                 end := false
  58                 if s[i] == '/' {
  59                         if i+1 == len(s) {
  60                                 return c, len(s)
  61                         }
  62                         end, i = true, i+1
  63                 }
  64                 j, e := eatTagName(s, i)
  65                 if j != i {
  66                         if end {
  67                                 e = elementNone
  68                         }
  69                         // We've found an HTML tag.
  70                         return context{state: stateTag, element: e}, j
  71                 }
  72                 k = j
  73         }
  74         panic("unreachable")
  75 }
  76
// elementContentType maps the element type recorded in the context to the
// state that tTag enters once the '>' closing a tag has been consumed.
var elementContentType = [...]state{
	elementNone:     stateText,
	elementScript:   stateJS,
	elementStyle:    stateCSS,
	elementTextarea: stateRCDATA,
	elementTitle:    stateRCDATA,
}
  84
  85 // tTag is the context transition function for the tag state.
  86 func tTag(c context, s []byte) (context, int) {
  87         // Find the attribute name.
  88         i := eatWhiteSpace(s, 0)
  89         if i == len(s) {
  90                 return c, len(s)
  91         }
  92         if s[i] == '>' {
  93                 return context{
  94                         state:   elementContentType[c.element],
  95                         element: c.element,
  96                 }, i + 1
  97         }
  98         j, err := eatAttrName(s, i)
  99         if err != nil {
 100                 return context{state: stateError, err: err}, len(s)
 101         }
 102         state, attr := stateTag, attrNone
 103         if i == j {
 104                 return context{
 105                         state: stateError,
 106                         err:   errorf(ErrBadHTML, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
 107                 }, len(s)
 108         }
 109         switch attrType(string(s[i:j])) {
 110         case contentTypeURL:
 111                 attr = attrURL
 112         case contentTypeCSS:
 113                 attr = attrStyle
 114         case contentTypeJS:
 115                 attr = attrScript
 116         }
 117         if j == len(s) {
 118                 state = stateAttrName
 119         } else {
 120                 state = stateAfterName
 121         }
 122         return context{state: state, element: c.element, attr: attr}, j
 123 }
 124
 125 // tAttrName is the context transition function for stateAttrName.
 126 func tAttrName(c context, s []byte) (context, int) {
 127         i, err := eatAttrName(s, 0)
 128         if err != nil {
 129                 return context{state: stateError, err: err}, len(s)
 130         } else if i != len(s) {
 131                 c.state = stateAfterName
 132         }
 133         return c, i
 134 }
 135
 136 // tAfterName is the context transition function for stateAfterName.
 137 func tAfterName(c context, s []byte) (context, int) {
 138         // Look for the start of the value.
 139         i := eatWhiteSpace(s, 0)
 140         if i == len(s) {
 141                 return c, len(s)
 142         } else if s[i] != '=' {
 143                 // Occurs due to tag ending '>', and valueless attribute.
 144                 c.state = stateTag
 145                 return c, i
 146         }
 147         c.state = stateBeforeValue
 148         // Consume the "=".
 149         return c, i + 1
 150 }
 151
// attrStartStates maps the attribute kind recorded in the context to the
// state that tBeforeValue enters at the start of that attribute's value.
var attrStartStates = [...]state{
	attrNone:   stateAttr,
	attrScript: stateJS,
	attrStyle:  stateCSS,
	attrURL:    stateURL,
}
 158
 159 // tBeforeValue is the context transition function for stateBeforeValue.
 160 func tBeforeValue(c context, s []byte) (context, int) {
 161         i := eatWhiteSpace(s, 0)
 162         if i == len(s) {
 163                 return c, len(s)
 164         }
 165         // Find the attribute delimiter.
 166         delim := delimSpaceOrTagEnd
 167         switch s[i] {
 168         case '\'':
 169                 delim, i = delimSingleQuote, i+1
 170         case '"':
 171                 delim, i = delimDoubleQuote, i+1
 172         }
 173         c.state, c.delim, c.attr = attrStartStates[c.attr], delim, attrNone
 174         return c, i
 175 }
 176
 177 // tHTMLCmt is the context transition function for stateHTMLCmt.
 178 func tHTMLCmt(c context, s []byte) (context, int) {
 179         if i := bytes.Index(s, commentEnd); i != -1 {
 180                 return context{}, i + 3
 181         }
 182         return c, len(s)
 183 }
 184
// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
//
// Each marker is lower-case ASCII and stops short of the closing '>'. There
// is no entry for elementNone; tSpecialTagEnd checks for that element before
// indexing this array.
var specialTagEndMarkers = [...]string{
	elementScript:   "</script",
	elementStyle:    "</style",
	elementTextarea: "</textarea",
	elementTitle:    "</title",
}
 193
 194 // tSpecialTagEnd is the context transition function for raw text and RCDATA
 195 // element states.
 196 func tSpecialTagEnd(c context, s []byte) (context, int) {
 197         if c.element != elementNone {
 198                 if i := strings.Index(strings.ToLower(string(s)), specialTagEndMarkers[c.element]); i != -1 {
 199                         return context{}, i
 200                 }
 201         }
 202         return c, len(s)
 203 }
 204
// tAttr is the context transition function for the attribute state.
// It consumes all of s and returns c unchanged.
func tAttr(c context, s []byte) (context, int) {
	return c, len(s)
}
 209
 210 // tURL is the context transition function for the URL state.
 211 func tURL(c context, s []byte) (context, int) {
 212         if bytes.IndexAny(s, "#?") >= 0 {
 213                 c.urlPart = urlPartQueryOrFrag
 214         } else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
 215                 // HTML5 uses "Valid URL potentially surrounded by spaces" for
 216                 // attrs: http://www.w3.org/TR/html5/index.html#attributes-1
 217                 c.urlPart = urlPartPreQuery
 218         }
 219         return c, len(s)
 220 }
 221
 222 // tJS is the context transition function for the JS state.
 223 func tJS(c context, s []byte) (context, int) {
 224         i := bytes.IndexAny(s, `"'/`)
 225         if i == -1 {
 226                 // Entire input is non string, comment, regexp tokens.
 227                 c.jsCtx = nextJSCtx(s, c.jsCtx)
 228                 return c, len(s)
 229         }
 230         c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
 231         switch s[i] {
 232         case '"':
 233                 c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
 234         case '\'':
 235                 c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
 236         case '/':
 237                 switch {
 238                 case i+1 < len(s) && s[i+1] == '/':
 239                         c.state, i = stateJSLineCmt, i+1
 240                 case i+1 < len(s) && s[i+1] == '*':
 241                         c.state, i = stateJSBlockCmt, i+1
 242                 case c.jsCtx == jsCtxRegexp:
 243                         c.state = stateJSRegexp
 244                 case c.jsCtx == jsCtxDivOp:
 245                         c.jsCtx = jsCtxRegexp
 246                 default:
 247                         return context{
 248                                 state: stateError,
 249                                 err:   errorf(ErrSlashAmbig, 0, "'/' could start a division or regexp: %.32q", s[i:]),
 250                         }, len(s)
 251                 }
 252         default:
 253                 panic("unreachable")
 254         }
 255         return c, i + 1
 256 }
 257
 258 // tJSDelimited is the context transition function for the JS string and regexp
 259 // states.
 260 func tJSDelimited(c context, s []byte) (context, int) {
 261         specials := `\"`
 262         switch c.state {
 263         case stateJSSqStr:
 264                 specials = `\'`
 265         case stateJSRegexp:
 266                 specials = `\/[]`
 267         }
 268
 269         k, inCharset := 0, false
 270         for {
 271                 i := k + bytes.IndexAny(s[k:], specials)
 272                 if i < k {
 273                         break
 274                 }
 275                 switch s[i] {
 276                 case '\\':
 277                         i++
 278                         if i == len(s) {
 279                                 return context{
 280                                         state: stateError,
 281                                         err:   errorf(ErrPartialEscape, 0, "unfinished escape sequence in JS string: %q", s),
 282                                 }, len(s)
 283                         }
 284                 case '[':
 285                         inCharset = true
 286                 case ']':
 287                         inCharset = false
 288                 default:
 289                         // end delimiter
 290                         if !inCharset {
 291                                 c.state, c.jsCtx = stateJS, jsCtxDivOp
 292                                 return c, i + 1
 293                         }
 294                 }
 295                 k = i + 1
 296         }
 297
 298         if inCharset {
 299                 // This can be fixed by making context richer if interpolation
 300                 // into charsets is desired.
 301                 return context{
 302                         state: stateError,
 303                         err:   errorf(ErrPartialCharset, 0, "unfinished JS regexp charset: %q", s),
 304                 }, len(s)
 305         }
 306
 307         return c, len(s)
 308 }
 309
 310 var blockCommentEnd = []byte("*/")
 311
 312 // tBlockCmt is the context transition function for /*comment*/ states.
 313 func tBlockCmt(c context, s []byte) (context, int) {
 314         i := bytes.Index(s, blockCommentEnd)
 315         if i == -1 {
 316                 return c, len(s)
 317         }
 318         switch c.state {
 319         case stateJSBlockCmt:
 320                 c.state = stateJS
 321         case stateCSSBlockCmt:
 322                 c.state = stateCSS
 323         default:
 324                 panic(c.state.String())
 325         }
 326         return c, i + 2
 327 }
 328
 329 // tLineCmt is the context transition function for //comment states.
 330 func tLineCmt(c context, s []byte) (context, int) {
 331         var lineTerminators string
 332         var endState state
 333         switch c.state {
 334         case stateJSLineCmt:
 335                 lineTerminators, endState = "\n\r\u2028\u2029", stateJS
 336         case stateCSSLineCmt:
 337                 lineTerminators, endState = "\n\f\r", stateCSS
 338                 // Line comments are not part of any published CSS standard but
 339                 // are supported by the 4 major browsers.
 340                 // This defines line comments as
 341                 //     LINECOMMENT ::= "//" [^\n\f\d]*
 342                 // since http://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
 343                 // newlines:
 344                 //     nl ::= #xA | #xD #xA | #xD | #xC
 345         default:
 346                 panic(c.state.String())
 347         }
 348
 349         i := bytes.IndexAny(s, lineTerminators)
 350         if i == -1 {
 351                 return c, len(s)
 352         }
 353         c.state = endState
 354         // Per section 7.4 of EcmaScript 5 : http://es5.github.com/#x7.4
 355         // "However, the LineTerminator at the end of the line is not
 356         // considered to be part of the single-line comment; it is
 357         // recognized separately by the lexical grammar and becomes part
 358         // of the stream of input elements for the syntactic grammar."
 359         return c, i
 360 }
 361
 362 // tCSS is the context transition function for the CSS state.
 363 func tCSS(c context, s []byte) (context, int) {
 364         // CSS quoted strings are almost never used except for:
 365         // (1) URLs as in background: "/foo.png"
 366         // (2) Multiword font-names as in font-family: "Times New Roman"
 367         // (3) List separators in content values as in inline-lists:
 368         //    <style>
 369         //    ul.inlineList { list-style: none; padding:0 }
 370         //    ul.inlineList > li { display: inline }
 371         //    ul.inlineList > li:before { content: ", " }
 372         //    ul.inlineList > li:first-child:before { content: "" }
 373         //    </style>
 374         //    <ul class=inlineList><li>One<li>Two<li>Three</ul>
 375         // (4) Attribute value selectors as in a[href="http://example.com/"]
 376         //
 377         // We conservatively treat all strings as URLs, but make some
 378         // allowances to avoid confusion.
 379         //
 380         // In (1), our conservative assumption is justified.
 381         // In (2), valid font names do not contain ':', '?', or '#', so our
 382         // conservative assumption is fine since we will never transition past
 383         // urlPartPreQuery.
 384         // In (3), our protocol heuristic should not be tripped, and there
 385         // should not be non-space content after a '?' or '#', so as long as
 386         // we only %-encode RFC 3986 reserved characters we are ok.
 387         // In (4), we should URL escape for URL attributes, and for others we
 388         // have the attribute name available if our conservative assumption
 389         // proves problematic for real code.
 390
 391         k := 0
 392         for {
 393                 i := k + bytes.IndexAny(s[k:], `("'/`)
 394                 if i < k {
 395                         return c, len(s)
 396                 }
 397                 switch s[i] {
 398                 case '(':
 399                         // Look for url to the left.
 400                         p := bytes.TrimRight(s[:i], "\t\n\f\r ")
 401                         if endsWithCSSKeyword(p, "url") {
 402                                 j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
 403                                 switch {
 404                                 case j != len(s) && s[j] == '"':
 405                                         c.state, j = stateCSSDqURL, j+1
 406                                 case j != len(s) && s[j] == '\'':
 407                                         c.state, j = stateCSSSqURL, j+1
 408                                 default:
 409                                         c.state = stateCSSURL
 410                                 }
 411                                 return c, j
 412                         }
 413                 case '/':
 414                         if i+1 < len(s) {
 415                                 switch s[i+1] {
 416                                 case '/':
 417                                         c.state = stateCSSLineCmt
 418                                         return c, i + 2
 419                                 case '*':
 420                                         c.state = stateCSSBlockCmt
 421                                         return c, i + 2
 422                                 }
 423                         }
 424                 case '"':
 425                         c.state = stateCSSDqStr
 426                         return c, i + 1
 427                 case '\'':
 428                         c.state = stateCSSSqStr
 429                         return c, i + 1
 430                 }
 431                 k = i + 1
 432         }
 433         panic("unreachable")
 434 }
 435
 436 // tCSSStr is the context transition function for the CSS string and URL states.
 437 func tCSSStr(c context, s []byte) (context, int) {
 438         var endAndEsc string
 439         switch c.state {
 440         case stateCSSDqStr, stateCSSDqURL:
 441                 endAndEsc = `\"`
 442         case stateCSSSqStr, stateCSSSqURL:
 443                 endAndEsc = `\'`
 444         case stateCSSURL:
 445                 // Unquoted URLs end with a newline or close parenthesis.
 446                 // The below includes the wc (whitespace character) and nl.
 447                 endAndEsc = "\\\t\n\f\r )"
 448         default:
 449                 panic(c.state.String())
 450         }
 451
 452         k := 0
 453         for {
 454                 i := k + bytes.IndexAny(s[k:], endAndEsc)
 455                 if i < k {
 456                         c, nread := tURL(c, decodeCSS(s[k:]))
 457                         return c, k + nread
 458                 }
 459                 if s[i] == '\\' {
 460                         i++
 461                         if i == len(s) {
 462                                 return context{
 463                                         state: stateError,
 464                                         err:   errorf(ErrPartialEscape, 0, "unfinished escape sequence in CSS string: %q", s),
 465                                 }, len(s)
 466                         }
 467                 } else {
 468                         c.state = stateCSS
 469                         return c, i + 1
 470                 }
 471                 c, _ = tURL(c, decodeCSS(s[:i+1]))
 472                 k = i + 1
 473         }
 474         panic("unreachable")
 475 }
 476
// tError is the context transition function for the error state.
// It consumes all of s and returns c unchanged.
func tError(c context, s []byte) (context, int) {
	return c, len(s)
}
 481
 482 // eatAttrName returns the largest j such that s[i:j] is an attribute name.
 483 // It returns an error if s[i:] does not look like it begins with an
 484 // attribute name, such as encountering a quote mark without a preceding
 485 // equals sign.
 486 func eatAttrName(s []byte, i int) (int, *Error) {
 487         for j := i; j < len(s); j++ {
 488                 switch s[j] {
 489                 case ' ', '\t', '\n', '\f', '\r', '=', '>':
 490                         return j, nil
 491                 case '\'', '"', '<':
 492                         // These result in a parse warning in HTML5 and are
 493                         // indicative of serious problems if seen in an attr
 494                         // name in a template.
 495                         return -1, errorf(ErrBadHTML, 0, "%q in attribute name: %.32q", s[j:j+1], s)
 496                 default:
 497                         // No-op.
 498                 }
 499         }
 500         return len(s), nil
 501 }
 502
// elementNameMap maps lower-case tag names to their element type.
// eatTagName lower-cases a tag name before looking it up here; names with no
// entry yield the zero element value.
var elementNameMap = map[string]element{
	"script":   elementScript,
	"style":    elementStyle,
	"textarea": elementTextarea,
	"title":    elementTitle,
}
 509
 510 // asciiAlpha returns whether c is an ASCII letter.
 511 func asciiAlpha(c byte) bool {
 512         return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
 513 }
 514
 515 // asciiAlphaNum returns whether c is an ASCII letter or digit.
 516 func asciiAlphaNum(c byte) bool {
 517         return asciiAlpha(c) || '0' <= c && c <= '9'
 518 }
 519
 520 // eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
 521 func eatTagName(s []byte, i int) (int, element) {
 522         if i == len(s) || !asciiAlpha(s[i]) {
 523                 return i, elementNone
 524         }
 525         j := i + 1
 526         for j < len(s) {
 527                 x := s[j]
 528                 if asciiAlphaNum(x) {
 529                         j++
 530                         continue
 531                 }
 532                 // Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
 533                 if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
 534                         j += 2
 535                         continue
 536                 }
 537                 break
 538         }
 539         return j, elementNameMap[strings.ToLower(string(s[i:j]))]
 540 }
 541
 542 // eatWhiteSpace returns the largest j such that s[i:j] is white space.
 543 func eatWhiteSpace(s []byte, i int) int {
 544         for j := i; j < len(s); j++ {
 545                 switch s[j] {
 546                 case ' ', '\t', '\n', '\f', '\r':
 547                         // No-op.
 548                 default:
 549                         return j
 550                 }
 551         }
 552         return len(s)
 553 }