libgo/go/text/template/parse/lex.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package parse
   6
   7 import (
   8         "fmt"
   9         "strings"
  10         "unicode"
  11         "unicode/utf8"
  12 )
  13
  14 // item represents a token or text string returned from the scanner.
  15 type item struct {
  16         typ itemType
  17         val string
  18 }
  19
  20 func (i item) String() string {
  21         switch {
  22         case i.typ == itemEOF:
  23                 return "EOF"
  24         case i.typ == itemError:
  25                 return i.val
  26         case i.typ > itemKeyword:
  27                 return fmt.Sprintf("<%s>", i.val)
  28         case len(i.val) > 10:
  29                 return fmt.Sprintf("%.10q...", i.val)
  30         }
  31         return fmt.Sprintf("%q", i.val)
  32 }
  33
  34 // itemType identifies the type of lex items.
  35 type itemType int
  36
  37 const (
  38         itemError        itemType = iota // error occurred; value is text of error
  39         itemBool                         // boolean constant
  40         itemChar                         // printable ASCII character; grab bag for comma etc.
  41         itemCharConstant                 // character constant
  42         itemComplex                      // complex constant (1+2i); imaginary is just a number
  43         itemColonEquals                  // colon-equals (':=') introducing a declaration
  44         itemEOF
  45         itemField      // alphanumeric identifier, starting with '.', possibly chained ('.x.y')
  46         itemIdentifier // alphanumeric identifier
  47         itemLeftDelim  // left action delimiter
  48         itemNumber     // simple number, including imaginary
  49         itemPipe       // pipe symbol
  50         itemRawString  // raw quoted string (includes quotes)
  51         itemRightDelim // right action delimiter
  52         itemString     // quoted string (includes quotes)
  53         itemText       // plain text
  54         itemVariable   // variable starting with '$', such as '$' or  '$1' or '$hello'.
  55         // Keywords appear after all the rest.
  56         itemKeyword  // used only to delimit the keywords
  57         itemDot      // the cursor, spelled '.'.
  58         itemDefine   // define keyword
  59         itemElse     // else keyword
  60         itemEnd      // end keyword
  61         itemIf       // if keyword
  62         itemRange    // range keyword
  63         itemTemplate // template keyword
  64         itemWith     // with keyword
  65 )
  66
  67 // Make the types prettyprint.
  68 var itemName = map[itemType]string{
  69         itemError:        "error",
  70         itemBool:         "bool",
  71         itemChar:         "char",
  72         itemCharConstant: "charconst",
  73         itemComplex:      "complex",
  74         itemColonEquals:  ":=",
  75         itemEOF:          "EOF",
  76         itemField:        "field",
  77         itemIdentifier:   "identifier",
  78         itemLeftDelim:    "left delim",
  79         itemNumber:       "number",
  80         itemPipe:         "pipe",
  81         itemRawString:    "raw string",
  82         itemRightDelim:   "right delim",
  83         itemString:       "string",
  84         itemVariable:     "variable",
  85         // keywords
  86         itemDot:      ".",
  87         itemDefine:   "define",
  88         itemElse:     "else",
  89         itemIf:       "if",
  90         itemEnd:      "end",
  91         itemRange:    "range",
  92         itemTemplate: "template",
  93         itemWith:     "with",
  94 }
  95
  96 func (i itemType) String() string {
  97         s := itemName[i]
  98         if s == "" {
  99                 return fmt.Sprintf("item%d", int(i))
 100         }
 101         return s
 102 }
 103
 104 var key = map[string]itemType{
 105         ".":        itemDot,
 106         "define":   itemDefine,
 107         "else":     itemElse,
 108         "end":      itemEnd,
 109         "if":       itemIf,
 110         "range":    itemRange,
 111         "template": itemTemplate,
 112         "with":     itemWith,
 113 }
 114
 115 const eof = -1
 116
 117 // stateFn represents the state of the scanner as a function that returns the next state.
 118 type stateFn func(*lexer) stateFn
 119
 120 // lexer holds the state of the scanner.
 121 type lexer struct {
 122         name       string    // the name of the input; used only for error reports.
 123         input      string    // the string being scanned.
 124         leftDelim  string    // start of action.
 125         rightDelim string    // end of action.
 126         state      stateFn   // the next lexing function to enter.
 127         pos        int       // current position in the input.
 128         start      int       // start position of this item.
 129         width      int       // width of last rune read from input.
 130         items      chan item // channel of scanned items.
 131 }
 132
 133 // next returns the next rune in the input.
 134 func (l *lexer) next() (r rune) {
 135         if l.pos >= len(l.input) {
 136                 l.width = 0
 137                 return eof
 138         }
 139         r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
 140         l.pos += l.width
 141         return r
 142 }
 143
 144 // peek returns but does not consume the next rune in the input.
 145 func (l *lexer) peek() rune {
 146         r := l.next()
 147         l.backup()
 148         return r
 149 }
 150
 151 // backup steps back one rune. Can only be called once per call of next.
 152 func (l *lexer) backup() {
 153         l.pos -= l.width
 154 }
 155
 156 // emit passes an item back to the client.
 157 func (l *lexer) emit(t itemType) {
 158         l.items <- item{t, l.input[l.start:l.pos]}
 159         l.start = l.pos
 160 }
 161
 162 // ignore skips over the pending input before this point.
 163 func (l *lexer) ignore() {
 164         l.start = l.pos
 165 }
 166
 167 // accept consumes the next rune if it's from the valid set.
 168 func (l *lexer) accept(valid string) bool {
 169         if strings.IndexRune(valid, l.next()) >= 0 {
 170                 return true
 171         }
 172         l.backup()
 173         return false
 174 }
 175
 176 // acceptRun consumes a run of runes from the valid set.
 177 func (l *lexer) acceptRun(valid string) {
 178         for strings.IndexRune(valid, l.next()) >= 0 {
 179         }
 180         l.backup()
 181 }
 182
 183 // lineNumber reports which line we're on. Doing it this way
 184 // means we don't have to worry about peek double counting.
 185 func (l *lexer) lineNumber() int {
 186         return 1 + strings.Count(l.input[:l.pos], "\n")
 187 }
 188
 189 // error returns an error token and terminates the scan by passing
 190 // back a nil pointer that will be the next state, terminating l.nextItem.
 191 func (l *lexer) errorf(format string, args ...interface{}) stateFn {
 192         l.items <- item{itemError, fmt.Sprintf(format, args...)}
 193         return nil
 194 }
 195
 196 // nextItem returns the next item from the input.
 197 func (l *lexer) nextItem() item {
 198         for {
 199                 select {
 200                 case item := <-l.items:
 201                         return item
 202                 default:
 203                         l.state = l.state(l)
 204                 }
 205         }
 206         panic("not reached")
 207 }
 208
 209 // lex creates a new scanner for the input string.
 210 func lex(name, input, left, right string) *lexer {
 211         if left == "" {
 212                 left = leftDelim
 213         }
 214         if right == "" {
 215                 right = rightDelim
 216         }
 217         l := &lexer{
 218                 name:       name,
 219                 input:      input,
 220                 leftDelim:  left,
 221                 rightDelim: right,
 222                 state:      lexText,
 223                 items:      make(chan item, 2), // Two items of buffering is sufficient for all state functions
 224         }
 225         return l
 226 }
 227
 228 // state functions
 229
 230 const (
 231         leftDelim    = "{{"
 232         rightDelim   = "}}"
 233         leftComment  = "/*"
 234         rightComment = "*/"
 235 )
 236
 237 // lexText scans until an opening action delimiter, "{{".
 238 func lexText(l *lexer) stateFn {
 239         for {
 240                 if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
 241                         if l.pos > l.start {
 242                                 l.emit(itemText)
 243                         }
 244                         return lexLeftDelim
 245                 }
 246                 if l.next() == eof {
 247                         break
 248                 }
 249         }
 250         // Correctly reached EOF.
 251         if l.pos > l.start {
 252                 l.emit(itemText)
 253         }
 254         l.emit(itemEOF)
 255         return nil
 256 }
 257
 258 // lexLeftDelim scans the left delimiter, which is known to be present.
 259 func lexLeftDelim(l *lexer) stateFn {
 260         l.pos += len(l.leftDelim)
 261         if strings.HasPrefix(l.input[l.pos:], leftComment) {
 262                 return lexComment
 263         }
 264         l.emit(itemLeftDelim)
 265         return lexInsideAction
 266 }
 267
 268 // lexComment scans a comment. The left comment marker is known to be present.
 269 func lexComment(l *lexer) stateFn {
 270         l.pos += len(leftComment)
 271         i := strings.Index(l.input[l.pos:], rightComment+l.rightDelim)
 272         if i < 0 {
 273                 return l.errorf("unclosed comment")
 274         }
 275         l.pos += i + len(rightComment) + len(l.rightDelim)
 276         l.ignore()
 277         return lexText
 278 }
 279
 280 // lexRightDelim scans the right delimiter, which is known to be present.
 281 func lexRightDelim(l *lexer) stateFn {
 282         l.pos += len(l.rightDelim)
 283         l.emit(itemRightDelim)
 284         return lexText
 285 }
 286
 287 // lexInsideAction scans the elements inside action delimiters.
 288 func lexInsideAction(l *lexer) stateFn {
 289         // Either number, quoted string, or identifier.
 290         // Spaces separate and are ignored.
 291         // Pipe symbols separate and are emitted.
 292         if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
 293                 return lexRightDelim
 294         }
 295         switch r := l.next(); {
 296         case r == eof || r == '\n':
 297                 return l.errorf("unclosed action")
 298         case isSpace(r):
 299                 l.ignore()
 300         case r == ':':
 301                 if l.next() != '=' {
 302                         return l.errorf("expected :=")
 303                 }
 304                 l.emit(itemColonEquals)
 305         case r == '|':
 306                 l.emit(itemPipe)
 307         case r == '"':
 308                 return lexQuote
 309         case r == '`':
 310                 return lexRawQuote
 311         case r == '$':
 312                 return lexIdentifier
 313         case r == '\'':
 314                 return lexChar
 315         case r == '.':
 316                 // special look-ahead for ".field" so we don't break l.backup().
 317                 if l.pos < len(l.input) {
 318                         r := l.input[l.pos]
 319                         if r < '0' || '9' < r {
 320                                 return lexIdentifier // itemDot comes from the keyword table.
 321                         }
 322                 }
 323                 fallthrough // '.' can start a number.
 324         case r == '+' || r == '-' || ('0' <= r && r <= '9'):
 325                 l.backup()
 326                 return lexNumber
 327         case isAlphaNumeric(r):
 328                 l.backup()
 329                 return lexIdentifier
 330         case r <= unicode.MaxASCII && unicode.IsPrint(r):
 331                 l.emit(itemChar)
 332                 return lexInsideAction
 333         default:
 334                 return l.errorf("unrecognized character in action: %#U", r)
 335         }
 336         return lexInsideAction
 337 }
 338
 339 // lexIdentifier scans an alphanumeric or field.
 340 func lexIdentifier(l *lexer) stateFn {
 341 Loop:
 342         for {
 343                 switch r := l.next(); {
 344                 case isAlphaNumeric(r):
 345                         // absorb.
 346                 case r == '.' && (l.input[l.start] == '.' || l.input[l.start] == '$'):
 347                         // field chaining; absorb into one token.
 348                 default:
 349                         l.backup()
 350                         word := l.input[l.start:l.pos]
 351                         if !l.atTerminator() {
 352                                 return l.errorf("unexpected character %+U", r)
 353                         }
 354                         switch {
 355                         case key[word] > itemKeyword:
 356                                 l.emit(key[word])
 357                         case word[0] == '.':
 358                                 l.emit(itemField)
 359                         case word[0] == '$':
 360                                 l.emit(itemVariable)
 361                         case word == "true", word == "false":
 362                                 l.emit(itemBool)
 363                         default:
 364                                 l.emit(itemIdentifier)
 365                         }
 366                         break Loop
 367                 }
 368         }
 369         return lexInsideAction
 370 }
 371
 372 // atTerminator reports whether the input is at valid termination character to
 373 // appear after an identifier. Mostly to catch cases like "$x+2" not being
 374 // acceptable without a space, in case we decide one day to implement
 375 // arithmetic.
 376 func (l *lexer) atTerminator() bool {
 377         r := l.peek()
 378         if isSpace(r) {
 379                 return true
 380         }
 381         switch r {
 382         case eof, ',', '|', ':':
 383                 return true
 384         }
 385         // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
 386         // succeed but should fail) but only in extremely rare cases caused by willfully
 387         // bad choice of delimiter.
 388         if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
 389                 return true
 390         }
 391         return false
 392 }
 393
 394 // lexChar scans a character constant. The initial quote is already
 395 // scanned.  Syntax checking is done by the parse.
 396 func lexChar(l *lexer) stateFn {
 397 Loop:
 398         for {
 399                 switch l.next() {
 400                 case '\\':
 401                         if r := l.next(); r != eof && r != '\n' {
 402                                 break
 403                         }
 404                         fallthrough
 405                 case eof, '\n':
 406                         return l.errorf("unterminated character constant")
 407                 case '\'':
 408                         break Loop
 409                 }
 410         }
 411         l.emit(itemCharConstant)
 412         return lexInsideAction
 413 }
 414
 415 // lexNumber scans a number: decimal, octal, hex, float, or imaginary.  This
 416 // isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
 417 // and "089" - but when it's wrong the input is invalid and the parser (via
 418 // strconv) will notice.
 419 func lexNumber(l *lexer) stateFn {
 420         if !l.scanNumber() {
 421                 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
 422         }
 423         if sign := l.peek(); sign == '+' || sign == '-' {
 424                 // Complex: 1+2i.  No spaces, must end in 'i'.
 425                 if !l.scanNumber() || l.input[l.pos-1] != 'i' {
 426                         return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
 427                 }
 428                 l.emit(itemComplex)
 429         } else {
 430                 l.emit(itemNumber)
 431         }
 432         return lexInsideAction
 433 }
 434
 435 func (l *lexer) scanNumber() bool {
 436         // Optional leading sign.
 437         l.accept("+-")
 438         // Is it hex?
 439         digits := "0123456789"
 440         if l.accept("0") && l.accept("xX") {
 441                 digits = "0123456789abcdefABCDEF"
 442         }
 443         l.acceptRun(digits)
 444         if l.accept(".") {
 445                 l.acceptRun(digits)
 446         }
 447         if l.accept("eE") {
 448                 l.accept("+-")
 449                 l.acceptRun("0123456789")
 450         }
 451         // Is it imaginary?
 452         l.accept("i")
 453         // Next thing mustn't be alphanumeric.
 454         if isAlphaNumeric(l.peek()) {
 455                 l.next()
 456                 return false
 457         }
 458         return true
 459 }
 460
 461 // lexQuote scans a quoted string.
 462 func lexQuote(l *lexer) stateFn {
 463 Loop:
 464         for {
 465                 switch l.next() {
 466                 case '\\':
 467                         if r := l.next(); r != eof && r != '\n' {
 468                                 break
 469                         }
 470                         fallthrough
 471                 case eof, '\n':
 472                         return l.errorf("unterminated quoted string")
 473                 case '"':
 474                         break Loop
 475                 }
 476         }
 477         l.emit(itemString)
 478         return lexInsideAction
 479 }
 480
 481 // lexRawQuote scans a raw quoted string.
 482 func lexRawQuote(l *lexer) stateFn {
 483 Loop:
 484         for {
 485                 switch l.next() {
 486                 case eof, '\n':
 487                         return l.errorf("unterminated raw quoted string")
 488                 case '`':
 489                         break Loop
 490                 }
 491         }
 492         l.emit(itemRawString)
 493         return lexInsideAction
 494 }
 495
 496 // isSpace reports whether r is a space character.
 497 func isSpace(r rune) bool {
 498         switch r {
 499         case ' ', '\t', '\n', '\r':
 500                 return true
 501         }
 502         return false
 503 }
 504
 505 // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
 506 func isAlphaNumeric(r rune) bool {
 507         return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
 508 }