libgo/go/text/template/parse/lex.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package parse
   6
   7 import (
   8         "fmt"
   9         "strings"
  10         "unicode"
  11         "unicode/utf8"
  12 )
  13
  14 // item represents a token or text string returned from the scanner.
  15 type item struct {
  16         typ itemType
  17         val string
  18 }
  19
  20 func (i item) String() string {
  21         switch {
  22         case i.typ == itemEOF:
  23                 return "EOF"
  24         case i.typ == itemError:
  25                 return i.val
  26         case i.typ > itemKeyword:
  27                 return fmt.Sprintf("<%s>", i.val)
  28         case len(i.val) > 10:
  29                 return fmt.Sprintf("%.10q...", i.val)
  30         }
  31         return fmt.Sprintf("%q", i.val)
  32 }
  33
  34 // itemType identifies the type of lex items.
  35 type itemType int
  36
  37 const (
  38         itemError        itemType = iota // error occurred; value is text of error
  39         itemBool                         // boolean constant
  40         itemChar                         // printable ASCII character; grab bag for comma etc.
  41         itemCharConstant                 // character constant
  42         itemComplex                      // complex constant (1+2i); imaginary is just a number
  43         itemColonEquals                  // colon-equals (':=') introducing a declaration
  44         itemEOF
  45         itemField      // alphanumeric identifier, starting with '.', possibly chained ('.x.y')
  46         itemIdentifier // alphanumeric identifier
  47         itemLeftDelim  // left action delimiter
  48         itemNumber     // simple number, including imaginary
  49         itemPipe       // pipe symbol
  50         itemRawString  // raw quoted string (includes quotes)
  51         itemRightDelim // right action delimiter
  52         itemString     // quoted string (includes quotes)
  53         itemText       // plain text
  54         itemVariable   // variable starting with '$', such as '$' or  '$1' or '$hello'.
  55         // Keywords appear after all the rest.
  56         itemKeyword  // used only to delimit the keywords
  57         itemDot      // the cursor, spelled '.'.
  58         itemDefine   // define keyword
  59         itemElse     // else keyword
  60         itemEnd      // end keyword
  61         itemIf       // if keyword
  62         itemRange    // range keyword
  63         itemTemplate // template keyword
  64         itemWith     // with keyword
  65 )
  66
  67 // Make the types prettyprint.
  68 var itemName = map[itemType]string{
  69         itemError:        "error",
  70         itemBool:         "bool",
  71         itemChar:         "char",
  72         itemCharConstant: "charconst",
  73         itemComplex:      "complex",
  74         itemColonEquals:  ":=",
  75         itemEOF:          "EOF",
  76         itemField:        "field",
  77         itemIdentifier:   "identifier",
  78         itemLeftDelim:    "left delim",
  79         itemNumber:       "number",
  80         itemPipe:         "pipe",
  81         itemRawString:    "raw string",
  82         itemRightDelim:   "right delim",
  83         itemString:       "string",
  84         itemVariable:     "variable",
  85         // keywords
  86         itemDot:      ".",
  87         itemDefine:   "define",
  88         itemElse:     "else",
  89         itemIf:       "if",
  90         itemEnd:      "end",
  91         itemRange:    "range",
  92         itemTemplate: "template",
  93         itemWith:     "with",
  94 }
  95
  96 func (i itemType) String() string {
  97         s := itemName[i]
  98         if s == "" {
  99                 return fmt.Sprintf("item%d", int(i))
 100         }
 101         return s
 102 }
 103
 104 var key = map[string]itemType{
 105         ".":        itemDot,
 106         "define":   itemDefine,
 107         "else":     itemElse,
 108         "end":      itemEnd,
 109         "if":       itemIf,
 110         "range":    itemRange,
 111         "template": itemTemplate,
 112         "with":     itemWith,
 113 }
 114
 115 const eof = -1
 116
 117 // stateFn represents the state of the scanner as a function that returns the next state.
 118 type stateFn func(*lexer) stateFn
 119
 120 // lexer holds the state of the scanner.
 121 type lexer struct {
 122         name       string    // the name of the input; used only for error reports.
 123         input      string    // the string being scanned.
 124         leftDelim  string    // start of action.
 125         rightDelim string    // end of action.
 126         state      stateFn   // the next lexing function to enter.
 127         pos        int       // current position in the input.
 128         start      int       // start position of this item.
 129         width      int       // width of last rune read from input.
 130         items      chan item // channel of scanned items.
 131 }
 132
 133 // next returns the next rune in the input.
 134 func (l *lexer) next() (r rune) {
 135         if l.pos >= len(l.input) {
 136                 l.width = 0
 137                 return eof
 138         }
 139         r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
 140         l.pos += l.width
 141         return r
 142 }
 143
 144 // peek returns but does not consume the next rune in the input.
 145 func (l *lexer) peek() rune {
 146         r := l.next()
 147         l.backup()
 148         return r
 149 }
 150
 151 // backup steps back one rune. Can only be called once per call of next.
 152 func (l *lexer) backup() {
 153         l.pos -= l.width
 154 }
 155
 156 // emit passes an item back to the client.
 157 func (l *lexer) emit(t itemType) {
 158         l.items <- item{t, l.input[l.start:l.pos]}
 159         l.start = l.pos
 160 }
 161
 162 // ignore skips over the pending input before this point.
 163 func (l *lexer) ignore() {
 164         l.start = l.pos
 165 }
 166
 167 // accept consumes the next rune if it's from the valid set.
 168 func (l *lexer) accept(valid string) bool {
 169         if strings.IndexRune(valid, l.next()) >= 0 {
 170                 return true
 171         }
 172         l.backup()
 173         return false
 174 }
 175
 176 // acceptRun consumes a run of runes from the valid set.
 177 func (l *lexer) acceptRun(valid string) {
 178         for strings.IndexRune(valid, l.next()) >= 0 {
 179         }
 180         l.backup()
 181 }
 182
 183 // lineNumber reports which line we're on. Doing it this way
 184 // means we don't have to worry about peek double counting.
 185 func (l *lexer) lineNumber() int {
 186         return 1 + strings.Count(l.input[:l.pos], "\n")
 187 }
 188
 189 // error returns an error token and terminates the scan by passing
 190 // back a nil pointer that will be the next state, terminating l.nextItem.
 191 func (l *lexer) errorf(format string, args ...interface{}) stateFn {
 192         l.items <- item{itemError, fmt.Sprintf(format, args...)}
 193         return nil
 194 }
 195
 196 // nextItem returns the next item from the input.
 197 func (l *lexer) nextItem() item {
 198         for {
 199                 select {
 200                 case item := <-l.items:
 201                         return item
 202                 default:
 203                         l.state = l.state(l)
 204                 }
 205         }
 206         panic("not reached")
 207 }
 208
 209 // lex creates a new scanner for the input string.
 210 func lex(name, input, left, right string) *lexer {
 211         if left == "" {
 212                 left = leftDelim
 213         }
 214         if right == "" {
 215                 right = rightDelim
 216         }
 217         l := &lexer{
 218                 name:       name,
 219                 input:      input,
 220                 leftDelim:  left,
 221                 rightDelim: right,
 222                 state:      lexText,
 223                 items:      make(chan item, 2), // Two items of buffering is sufficient for all state functions
 224         }
 225         return l
 226 }
 227
 228 // state functions
 229
 230 const (
 231         leftDelim    = "{{"
 232         rightDelim   = "}}"
 233         leftComment  = "/*"
 234         rightComment = "*/"
 235 )
 236
 237 // lexText scans until an opening action delimiter, "{{".
 238 func lexText(l *lexer) stateFn {
 239         for {
 240                 if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
 241                         if l.pos > l.start {
 242                                 l.emit(itemText)
 243                         }
 244                         return lexLeftDelim
 245                 }
 246                 if l.next() == eof {
 247                         break
 248                 }
 249         }
 250         // Correctly reached EOF.
 251         if l.pos > l.start {
 252                 l.emit(itemText)
 253         }
 254         l.emit(itemEOF)
 255         return nil
 256 }
 257
 258 // lexLeftDelim scans the left delimiter, which is known to be present.
 259 func lexLeftDelim(l *lexer) stateFn {
 260         if strings.HasPrefix(l.input[l.pos:], l.leftDelim+leftComment) {
 261                 return lexComment
 262         }
 263         l.pos += len(l.leftDelim)
 264         l.emit(itemLeftDelim)
 265         return lexInsideAction
 266 }
 267
 268 // lexComment scans a comment. The left comment marker is known to be present.
 269 func lexComment(l *lexer) stateFn {
 270         i := strings.Index(l.input[l.pos:], rightComment+l.rightDelim)
 271         if i < 0 {
 272                 return l.errorf("unclosed comment")
 273         }
 274         l.pos += i + len(rightComment) + len(l.rightDelim)
 275         l.ignore()
 276         return lexText
 277 }
 278
 279 // lexRightDelim scans the right delimiter, which is known to be present.
 280 func lexRightDelim(l *lexer) stateFn {
 281         l.pos += len(l.rightDelim)
 282         l.emit(itemRightDelim)
 283         return lexText
 284 }
 285
 286 // lexInsideAction scans the elements inside action delimiters.
 287 func lexInsideAction(l *lexer) stateFn {
 288         // Either number, quoted string, or identifier.
 289         // Spaces separate and are ignored.
 290         // Pipe symbols separate and are emitted.
 291         if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
 292                 return lexRightDelim
 293         }
 294         switch r := l.next(); {
 295         case r == eof || r == '\n':
 296                 return l.errorf("unclosed action")
 297         case isSpace(r):
 298                 l.ignore()
 299         case r == ':':
 300                 if l.next() != '=' {
 301                         return l.errorf("expected :=")
 302                 }
 303                 l.emit(itemColonEquals)
 304         case r == '|':
 305                 l.emit(itemPipe)
 306         case r == '"':
 307                 return lexQuote
 308         case r == '`':
 309                 return lexRawQuote
 310         case r == '$':
 311                 return lexIdentifier
 312         case r == '\'':
 313                 return lexChar
 314         case r == '.':
 315                 // special look-ahead for ".field" so we don't break l.backup().
 316                 if l.pos < len(l.input) {
 317                         r := l.input[l.pos]
 318                         if r < '0' || '9' < r {
 319                                 return lexIdentifier // itemDot comes from the keyword table.
 320                         }
 321                 }
 322                 fallthrough // '.' can start a number.
 323         case r == '+' || r == '-' || ('0' <= r && r <= '9'):
 324                 l.backup()
 325                 return lexNumber
 326         case isAlphaNumeric(r):
 327                 l.backup()
 328                 return lexIdentifier
 329         case r <= unicode.MaxASCII && unicode.IsPrint(r):
 330                 l.emit(itemChar)
 331                 return lexInsideAction
 332         default:
 333                 return l.errorf("unrecognized character in action: %#U", r)
 334         }
 335         return lexInsideAction
 336 }
 337
 338 // lexIdentifier scans an alphanumeric or field.
 339 func lexIdentifier(l *lexer) stateFn {
 340 Loop:
 341         for {
 342                 switch r := l.next(); {
 343                 case isAlphaNumeric(r):
 344                         // absorb.
 345                 case r == '.' && (l.input[l.start] == '.' || l.input[l.start] == '$'):
 346                         // field chaining; absorb into one token.
 347                 default:
 348                         l.backup()
 349                         word := l.input[l.start:l.pos]
 350                         if !l.atTerminator() {
 351                                 return l.errorf("unexpected character %+U", r)
 352                         }
 353                         switch {
 354                         case key[word] > itemKeyword:
 355                                 l.emit(key[word])
 356                         case word[0] == '.':
 357                                 l.emit(itemField)
 358                         case word[0] == '$':
 359                                 l.emit(itemVariable)
 360                         case word == "true", word == "false":
 361                                 l.emit(itemBool)
 362                         default:
 363                                 l.emit(itemIdentifier)
 364                         }
 365                         break Loop
 366                 }
 367         }
 368         return lexInsideAction
 369 }
 370
 371 // atTerminator reports whether the input is at valid termination character to
 372 // appear after an identifier. Mostly to catch cases like "$x+2" not being
 373 // acceptable without a space, in case we decide one day to implement
 374 // arithmetic.
 375 func (l *lexer) atTerminator() bool {
 376         r := l.peek()
 377         if isSpace(r) {
 378                 return true
 379         }
 380         switch r {
 381         case eof, ',', '|', ':':
 382                 return true
 383         }
 384         // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
 385         // succeed but should fail) but only in extremely rare cases caused by willfully
 386         // bad choice of delimiter.
 387         if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
 388                 return true
 389         }
 390         return false
 391 }
 392
 393 // lexChar scans a character constant. The initial quote is already
 394 // scanned.  Syntax checking is done by the parse.
 395 func lexChar(l *lexer) stateFn {
 396 Loop:
 397         for {
 398                 switch l.next() {
 399                 case '\\':
 400                         if r := l.next(); r != eof && r != '\n' {
 401                                 break
 402                         }
 403                         fallthrough
 404                 case eof, '\n':
 405                         return l.errorf("unterminated character constant")
 406                 case '\'':
 407                         break Loop
 408                 }
 409         }
 410         l.emit(itemCharConstant)
 411         return lexInsideAction
 412 }
 413
 414 // lexNumber scans a number: decimal, octal, hex, float, or imaginary.  This
 415 // isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
 416 // and "089" - but when it's wrong the input is invalid and the parser (via
 417 // strconv) will notice.
 418 func lexNumber(l *lexer) stateFn {
 419         if !l.scanNumber() {
 420                 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
 421         }
 422         if sign := l.peek(); sign == '+' || sign == '-' {
 423                 // Complex: 1+2i.  No spaces, must end in 'i'.
 424                 if !l.scanNumber() || l.input[l.pos-1] != 'i' {
 425                         return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
 426                 }
 427                 l.emit(itemComplex)
 428         } else {
 429                 l.emit(itemNumber)
 430         }
 431         return lexInsideAction
 432 }
 433
 434 func (l *lexer) scanNumber() bool {
 435         // Optional leading sign.
 436         l.accept("+-")
 437         // Is it hex?
 438         digits := "0123456789"
 439         if l.accept("0") && l.accept("xX") {
 440                 digits = "0123456789abcdefABCDEF"
 441         }
 442         l.acceptRun(digits)
 443         if l.accept(".") {
 444                 l.acceptRun(digits)
 445         }
 446         if l.accept("eE") {
 447                 l.accept("+-")
 448                 l.acceptRun("0123456789")
 449         }
 450         // Is it imaginary?
 451         l.accept("i")
 452         // Next thing mustn't be alphanumeric.
 453         if isAlphaNumeric(l.peek()) {
 454                 l.next()
 455                 return false
 456         }
 457         return true
 458 }
 459
 460 // lexQuote scans a quoted string.
 461 func lexQuote(l *lexer) stateFn {
 462 Loop:
 463         for {
 464                 switch l.next() {
 465                 case '\\':
 466                         if r := l.next(); r != eof && r != '\n' {
 467                                 break
 468                         }
 469                         fallthrough
 470                 case eof, '\n':
 471                         return l.errorf("unterminated quoted string")
 472                 case '"':
 473                         break Loop
 474                 }
 475         }
 476         l.emit(itemString)
 477         return lexInsideAction
 478 }
 479
 480 // lexRawQuote scans a raw quoted string.
 481 func lexRawQuote(l *lexer) stateFn {
 482 Loop:
 483         for {
 484                 switch l.next() {
 485                 case eof, '\n':
 486                         return l.errorf("unterminated raw quoted string")
 487                 case '`':
 488                         break Loop
 489                 }
 490         }
 491         l.emit(itemRawString)
 492         return lexInsideAction
 493 }
 494
 495 // isSpace reports whether r is a space character.
 496 func isSpace(r rune) bool {
 497         switch r {
 498         case ' ', '\t', '\n', '\r':
 499                 return true
 500         }
 501         return false
 502 }
 503
 504 // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
 505 func isAlphaNumeric(r rune) bool {
 506         return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
 507 }