libgo/go/go/scanner/scanner.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package scanner implements a scanner for Go source text. It takes a
   6 // []byte as source, which can then be tokenized through repeated calls
   7 // to the Scan method. Typical use:
   8 //
   9 //      var s scanner.Scanner
  10 //      fset := token.NewFileSet()  // position information is relative to fset
  11 //      file := fset.AddFile(filename, fset.Base(), len(src))  // register file
  12 //      s.Init(file, src, nil /* no error handler */, 0)
  13 //      for {
  14 //              pos, tok, lit := s.Scan()
  15 //              if tok == token.EOF {
  16 //                      break
  17 //              }
  18 //              // do something here with pos, tok, and lit
  19 //      }
  20 //
  21 package scanner
  22
  23 import (
  24         "bytes"
  25         "fmt"
  26         "go/token"
  27         "path/filepath"
  28         "strconv"
  29         "unicode"
  30         "unicode/utf8"
  31 )
  32
// A Scanner holds the scanner's internal state while processing
// a given text.  It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state (advanced by next)
	ch         rune // current character; < 0 means end-of-file
	offset     int  // offset of the current character in src
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // offset in src at which the current line starts
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
  55
  56 // Read the next Unicode char into s.ch.
  57 // s.ch < 0 means end-of-file.
  58 //
  59 func (s *Scanner) next() {
  60         if s.rdOffset < len(s.src) {
  61                 s.offset = s.rdOffset
  62                 if s.ch == '\n' {
  63                         s.lineOffset = s.offset
  64                         s.file.AddLine(s.offset)
  65                 }
  66                 r, w := rune(s.src[s.rdOffset]), 1
  67                 switch {
  68                 case r == 0:
  69                         s.error(s.offset, "illegal character NUL")
  70                 case r >= 0x80:
  71                         // not ASCII
  72                         r, w = utf8.DecodeRune(s.src[s.rdOffset:])
  73                         if r == utf8.RuneError && w == 1 {
  74                                 s.error(s.offset, "illegal UTF-8 encoding")
  75                         }
  76                 }
  77                 s.rdOffset += w
  78                 s.ch = r
  79         } else {
  80                 s.offset = len(s.src)
  81                 if s.ch == '\n' {
  82                         s.lineOffset = s.offset
  83                         s.file.AddLine(s.offset)
  84                 }
  85                 s.ch = -1 // eof
  86         }
  87 }
  88
// A Mode value is a set of flags (or 0).
// The flags control scanner behavior.
//
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
  98
  99 // Init prepares the scanner s to tokenize the text src by setting the
 100 // scanner at the beginning of src. The scanner uses the file set file
 101 // for position information and it adds line information for each line.
 102 // It is ok to re-use the same file when re-scanning the same file as
 103 // line information which is already present is ignored. Init causes a
 104 // panic if the file size does not match the src size.
 105 //
 106 // Calls to Scan will use the error handler err if they encounter a
 107 // syntax error and err is not nil. Also, for each error encountered,
 108 // the Scanner field ErrorCount is incremented by one. The mode parameter
 109 // determines how comments are handled.
 110 //
 111 // Note that Init may call err if there is an error in the first character
 112 // of the file.
 113 //
 114 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
 115         // Explicitly initialize all fields since a scanner may be reused.
 116         if file.Size() != len(src) {
 117                 panic("file size does not match src len")
 118         }
 119         s.file = file
 120         s.dir, _ = filepath.Split(file.Name())
 121         s.src = src
 122         s.err = err
 123         s.mode = mode
 124
 125         s.ch = ' '
 126         s.offset = 0
 127         s.rdOffset = 0
 128         s.lineOffset = 0
 129         s.insertSemi = false
 130         s.ErrorCount = 0
 131
 132         s.next()
 133 }
 134
 135 func (s *Scanner) error(offs int, msg string) {
 136         if s.err != nil {
 137                 s.err.Error(s.file.Position(s.file.Pos(offs)), msg)
 138         }
 139         s.ErrorCount++
 140 }
 141
 142 var prefix = []byte("//line ")
 143
 144 func (s *Scanner) interpretLineComment(text []byte) {
 145         if bytes.HasPrefix(text, prefix) {
 146                 // get filename and line number, if any
 147                 if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
 148                         if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
 149                                 // valid //line filename:line comment;
 150                                 filename := filepath.Clean(string(text[len(prefix):i]))
 151                                 if !filepath.IsAbs(filename) {
 152                                         // make filename relative to current directory
 153                                         filename = filepath.Join(s.dir, filename)
 154                                 }
 155                                 // update scanner position
 156                                 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
 157                         }
 158                 }
 159         }
 160 }
 161
 162 func (s *Scanner) scanComment() string {
 163         // initial '/' already consumed; s.ch == '/' || s.ch == '*'
 164         offs := s.offset - 1 // position of initial '/'
 165
 166         if s.ch == '/' {
 167                 //-style comment
 168                 s.next()
 169                 for s.ch != '\n' && s.ch >= 0 {
 170                         s.next()
 171                 }
 172                 if offs == s.lineOffset {
 173                         // comment starts at the beginning of the current line
 174                         s.interpretLineComment(s.src[offs:s.offset])
 175                 }
 176                 goto exit
 177         }
 178
 179         /*-style comment */
 180         s.next()
 181         for s.ch >= 0 {
 182                 ch := s.ch
 183                 s.next()
 184                 if ch == '*' && s.ch == '/' {
 185                         s.next()
 186                         goto exit
 187                 }
 188         }
 189
 190         s.error(offs, "comment not terminated")
 191
 192 exit:
 193         return string(s.src[offs:s.offset])
 194 }
 195
// findLineEnd looks ahead from a comment whose initial '/' has already
// been consumed and reports whether a newline or the end of the source is
// reached before the next non-comment token. It looks through any further
// comments that follow on the same line. The scanner's ch, offset and
// rdOffset are restored before returning (see the deferred function).
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1) // s.offset-1 is the position of the initial '/'

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				// end of this comment
				s.next()
				break
			}
		}
		// The caller only gets here with s.insertSemi set, so
		// skipWhitespace stops at a newline instead of skipping it.
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	// the '/' was not followed by '/' or '*': not a comment
	return false
}
 239
 240 func isLetter(ch rune) bool {
 241         return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
 242 }
 243
 244 func isDigit(ch rune) bool {
 245         return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
 246 }
 247
 248 func (s *Scanner) scanIdentifier() string {
 249         offs := s.offset
 250         for isLetter(s.ch) || isDigit(s.ch) {
 251                 s.next()
 252         }
 253         return string(s.src[offs:s.offset])
 254 }
 255
 256 func digitVal(ch rune) int {
 257         switch {
 258         case '0' <= ch && ch <= '9':
 259                 return int(ch - '0')
 260         case 'a' <= ch && ch <= 'f':
 261                 return int(ch - 'a' + 10)
 262         case 'A' <= ch && ch <= 'F':
 263                 return int(ch - 'A' + 10)
 264         }
 265         return 16 // larger than any legal digit val
 266 }
 267
 268 func (s *Scanner) scanMantissa(base int) {
 269         for digitVal(s.ch) < base {
 270                 s.next()
 271         }
 272 }
 273
 274 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
 275         // digitVal(s.ch) < 10
 276         offs := s.offset
 277         tok := token.INT
 278
 279         if seenDecimalPoint {
 280                 offs--
 281                 tok = token.FLOAT
 282                 s.scanMantissa(10)
 283                 goto exponent
 284         }
 285
 286         if s.ch == '0' {
 287                 // int or float
 288                 offs := s.offset
 289                 s.next()
 290                 if s.ch == 'x' || s.ch == 'X' {
 291                         // hexadecimal int
 292                         s.next()
 293                         s.scanMantissa(16)
 294                         if s.offset-offs <= 2 {
 295                                 // only scanned "0x" or "0X"
 296                                 s.error(offs, "illegal hexadecimal number")
 297                         }
 298                 } else {
 299                         // octal int or float
 300                         seenDecimalDigit := false
 301                         s.scanMantissa(8)
 302                         if s.ch == '8' || s.ch == '9' {
 303                                 // illegal octal int or float
 304                                 seenDecimalDigit = true
 305                                 s.scanMantissa(10)
 306                         }
 307                         if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
 308                                 goto fraction
 309                         }
 310                         // octal int
 311                         if seenDecimalDigit {
 312                                 s.error(offs, "illegal octal number")
 313                         }
 314                 }
 315                 goto exit
 316         }
 317
 318         // decimal int or float
 319         s.scanMantissa(10)
 320
 321 fraction:
 322         if s.ch == '.' {
 323                 tok = token.FLOAT
 324                 s.next()
 325                 s.scanMantissa(10)
 326         }
 327
 328 exponent:
 329         if s.ch == 'e' || s.ch == 'E' {
 330                 tok = token.FLOAT
 331                 s.next()
 332                 if s.ch == '-' || s.ch == '+' {
 333                         s.next()
 334                 }
 335                 s.scanMantissa(10)
 336         }
 337
 338         if s.ch == 'i' {
 339                 tok = token.IMAG
 340                 s.next()
 341         }
 342
 343 exit:
 344         return tok, string(s.src[offs:s.offset])
 345 }
 346
 347 func (s *Scanner) scanEscape(quote rune) {
 348         offs := s.offset
 349
 350         var i, base, max uint32
 351         switch s.ch {
 352         case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
 353                 s.next()
 354                 return
 355         case '0', '1', '2', '3', '4', '5', '6', '7':
 356                 i, base, max = 3, 8, 255
 357         case 'x':
 358                 s.next()
 359                 i, base, max = 2, 16, 255
 360         case 'u':
 361                 s.next()
 362                 i, base, max = 4, 16, unicode.MaxRune
 363         case 'U':
 364                 s.next()
 365                 i, base, max = 8, 16, unicode.MaxRune
 366         default:
 367                 s.next() // always make progress
 368                 s.error(offs, "unknown escape sequence")
 369                 return
 370         }
 371
 372         var x uint32
 373         for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
 374                 d := uint32(digitVal(s.ch))
 375                 if d >= base {
 376                         s.error(s.offset, "illegal character in escape sequence")
 377                         break
 378                 }
 379                 x = x*base + d
 380                 s.next()
 381         }
 382         // in case of an error, consume remaining chars
 383         for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
 384                 s.next()
 385         }
 386         if x > max || 0xd800 <= x && x < 0xe000 {
 387                 s.error(offs, "escape sequence is invalid Unicode code point")
 388         }
 389 }
 390
 391 func (s *Scanner) scanChar() string {
 392         // '\'' opening already consumed
 393         offs := s.offset - 1
 394
 395         n := 0
 396         for s.ch != '\'' {
 397                 ch := s.ch
 398                 n++
 399                 s.next()
 400                 if ch == '\n' || ch < 0 {
 401                         s.error(offs, "character literal not terminated")
 402                         n = 1
 403                         break
 404                 }
 405                 if ch == '\\' {
 406                         s.scanEscape('\'')
 407                 }
 408         }
 409
 410         s.next()
 411
 412         if n != 1 {
 413                 s.error(offs, "illegal character literal")
 414         }
 415
 416         return string(s.src[offs:s.offset])
 417 }
 418
 419 func (s *Scanner) scanString() string {
 420         // '"' opening already consumed
 421         offs := s.offset - 1
 422
 423         for s.ch != '"' {
 424                 ch := s.ch
 425                 s.next()
 426                 if ch == '\n' || ch < 0 {
 427                         s.error(offs, "string not terminated")
 428                         break
 429                 }
 430                 if ch == '\\' {
 431                         s.scanEscape('"')
 432                 }
 433         }
 434
 435         s.next()
 436
 437         return string(s.src[offs:s.offset])
 438 }
 439
 440 func stripCR(b []byte) []byte {
 441         c := make([]byte, len(b))
 442         i := 0
 443         for _, ch := range b {
 444                 if ch != '\r' {
 445                         c[i] = ch
 446                         i++
 447                 }
 448         }
 449         return c[:i]
 450 }
 451
 452 func (s *Scanner) scanRawString() string {
 453         // '`' opening already consumed
 454         offs := s.offset - 1
 455
 456         hasCR := false
 457         for s.ch != '`' {
 458                 ch := s.ch
 459                 s.next()
 460                 if ch == '\r' {
 461                         hasCR = true
 462                 }
 463                 if ch < 0 {
 464                         s.error(offs, "string not terminated")
 465                         break
 466                 }
 467         }
 468
 469         s.next()
 470
 471         lit := s.src[offs:s.offset]
 472         if hasCR {
 473                 lit = stripCR(lit)
 474         }
 475
 476         return string(lit)
 477 }
 478
 479 func (s *Scanner) skipWhitespace() {
 480         for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
 481                 s.next()
 482         }
 483 }
 484
 485 // Helper functions for scanning multi-byte tokens such as >> += >>= .
 486 // Different routines recognize different length tok_i based on matches
 487 // of ch_i. If a token ends in '=', the result is tok1 or tok3
 488 // respectively. Otherwise, the result is tok0 if there was no other
 489 // matching character, or tok2 if the matching character was ch2.
 490
 491 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
 492         if s.ch == '=' {
 493                 s.next()
 494                 return tok1
 495         }
 496         return tok0
 497 }
 498
 499 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
 500         if s.ch == '=' {
 501                 s.next()
 502                 return tok1
 503         }
 504         if s.ch == ch2 {
 505                 s.next()
 506                 return tok2
 507         }
 508         return tok0
 509 }
 510
 511 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
 512         if s.ch == '=' {
 513                 s.next()
 514                 return tok1
 515         }
 516         if s.ch == ch2 {
 517                 s.next()
 518                 if s.ch == '=' {
 519                         s.next()
 520                         return tok3
 521                 }
 522                 return tok2
 523         }
 524         return tok0
 525 }
 526
 527 // Scan scans the next token and returns the token position, the token,
 528 // and its literal string if applicable. The source end is indicated by
 529 // token.EOF.
 530 //
 531 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
 532 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
 533 // has the corresponding value.
 534 //
 535 // If the returned token is token.SEMICOLON, the corresponding
 536 // literal string is ";" if the semicolon was present in the source,
 537 // and "\n" if the semicolon was inserted because of a newline or
 538 // at EOF.
 539 //
 540 // If the returned token is token.ILLEGAL, the literal string is the
 541 // offending character.
 542 //
 543 // In all other cases, Scan returns an empty literal string.
 544 //
 545 // For more tolerant parsing, Scan will return a valid token if
 546 // possible even if a syntax error was encountered. Thus, even
 547 // if the resulting token sequence contains no illegal tokens,
 548 // a client may not assume that no error occurred. Instead it
 549 // must check the scanner's ErrorCount or the number of calls
 550 // of the error handler, if there was one installed.
 551 //
 552 // Scan adds line information to the file added to the file
 553 // set with Init. Token positions are relative to that file
 554 // and thus relative to the file set.
 555 //
 556 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
 557 scanAgain:
 558         s.skipWhitespace()
 559
 560         // current token start
 561         pos = s.file.Pos(s.offset)
 562
 563         // determine token value
 564         insertSemi := false
 565         switch ch := s.ch; {
 566         case isLetter(ch):
 567                 lit = s.scanIdentifier()
 568                 tok = token.Lookup(lit)
 569                 switch tok {
 570                 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
 571                         insertSemi = true
 572                 }
 573         case digitVal(ch) < 10:
 574                 insertSemi = true
 575                 tok, lit = s.scanNumber(false)
 576         default:
 577                 s.next() // always make progress
 578                 switch ch {
 579                 case -1:
 580                         if s.insertSemi {
 581                                 s.insertSemi = false // EOF consumed
 582                                 return pos, token.SEMICOLON, "\n"
 583                         }
 584                         tok = token.EOF
 585                 case '\n':
 586                         // we only reach here if s.insertSemi was
 587                         // set in the first place and exited early
 588                         // from s.skipWhitespace()
 589                         s.insertSemi = false // newline consumed
 590                         return pos, token.SEMICOLON, "\n"
 591                 case '"':
 592                         insertSemi = true
 593                         tok = token.STRING
 594                         lit = s.scanString()
 595                 case '\'':
 596                         insertSemi = true
 597                         tok = token.CHAR
 598                         lit = s.scanChar()
 599                 case '`':
 600                         insertSemi = true
 601                         tok = token.STRING
 602                         lit = s.scanRawString()
 603                 case ':':
 604                         tok = s.switch2(token.COLON, token.DEFINE)
 605                 case '.':
 606                         if digitVal(s.ch) < 10 {
 607                                 insertSemi = true
 608                                 tok, lit = s.scanNumber(true)
 609                         } else if s.ch == '.' {
 610                                 s.next()
 611                                 if s.ch == '.' {
 612                                         s.next()
 613                                         tok = token.ELLIPSIS
 614                                 }
 615                         } else {
 616                                 tok = token.PERIOD
 617                         }
 618                 case ',':
 619                         tok = token.COMMA
 620                 case ';':
 621                         tok = token.SEMICOLON
 622                         lit = ";"
 623                 case '(':
 624                         tok = token.LPAREN
 625                 case ')':
 626                         insertSemi = true
 627                         tok = token.RPAREN
 628                 case '[':
 629                         tok = token.LBRACK
 630                 case ']':
 631                         insertSemi = true
 632                         tok = token.RBRACK
 633                 case '{':
 634                         tok = token.LBRACE
 635                 case '}':
 636                         insertSemi = true
 637                         tok = token.RBRACE
 638                 case '+':
 639                         tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
 640                         if tok == token.INC {
 641                                 insertSemi = true
 642                         }
 643                 case '-':
 644                         tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
 645                         if tok == token.DEC {
 646                                 insertSemi = true
 647                         }
 648                 case '*':
 649                         tok = s.switch2(token.MUL, token.MUL_ASSIGN)
 650                 case '/':
 651                         if s.ch == '/' || s.ch == '*' {
 652                                 // comment
 653                                 if s.insertSemi && s.findLineEnd() {
 654                                         // reset position to the beginning of the comment
 655                                         s.ch = '/'
 656                                         s.offset = s.file.Offset(pos)
 657                                         s.rdOffset = s.offset + 1
 658                                         s.insertSemi = false // newline consumed
 659                                         return pos, token.SEMICOLON, "\n"
 660                                 }
 661                                 lit = s.scanComment()
 662                                 if s.mode&ScanComments == 0 {
 663                                         // skip comment
 664                                         s.insertSemi = false // newline consumed
 665                                         goto scanAgain
 666                                 }
 667                                 tok = token.COMMENT
 668                         } else {
 669                                 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
 670                         }
 671                 case '%':
 672                         tok = s.switch2(token.REM, token.REM_ASSIGN)
 673                 case '^':
 674                         tok = s.switch2(token.XOR, token.XOR_ASSIGN)
 675                 case '<':
 676                         if s.ch == '-' {
 677                                 s.next()
 678                                 tok = token.ARROW
 679                         } else {
 680                                 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
 681                         }
 682                 case '>':
 683                         tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
 684                 case '=':
 685                         tok = s.switch2(token.ASSIGN, token.EQL)
 686                 case '!':
 687                         tok = s.switch2(token.NOT, token.NEQ)
 688                 case '&':
 689                         if s.ch == '^' {
 690                                 s.next()
 691                                 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
 692                         } else {
 693                                 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
 694                         }
 695                 case '|':
 696                         tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
 697                 default:
 698                         s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
 699                         insertSemi = s.insertSemi // preserve insertSemi info
 700                         tok = token.ILLEGAL
 701                         lit = string(ch)
 702                 }
 703         }
 704         if s.mode&dontInsertSemis == 0 {
 705                 s.insertSemi = insertSemi
 706         }
 707
 708         return
 709 }