1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package scanner implements a scanner for Go source text. It takes a []byte
6 // as source, which can then be tokenized through repeated calls to the Scan
7 // method. Typical use:
9 // var s scanner.Scanner
10 // fset := token.NewFileSet() // position information is relative to fset
11 // file := fset.AddFile(filename, fset.Base(), len(src)) // register file
12 // s.Init(file, src, nil /* no error handler */, 0)
14 // pos, tok, lit := s.Scan()
15 // if tok == token.EOF {
18 // // do something here with pos, tok, and lit
33 // A Scanner holds the scanner's internal state while processing
34 // a given text. It can be allocated as part of another data
35 // structure but must be initialized via Init before use.
39 file *token.File // source file handle
40 dir string // directory portion of file.Name()
42 err ErrorHandler // error reporting; or nil
43 mode Mode // scanning mode (set of Mode flags)
46 ch rune // current character; < 0 means end-of-file
47 offset int // byte offset of the current character in src
48 rdOffset int // reading offset (position after current character)
49 lineOffset int // current line offset
50 insertSemi bool // insert a semicolon before next newline
52 // public state - ok to modify
53 ErrorCount int // number of errors encountered
56 // next reads the next Unicode char into s.ch.
57 // s.ch < 0 means end-of-file.
// Illegal input (a NUL byte or an invalid UTF-8 encoding) is reported
// through s.error at the current offset.
59 func (s *Scanner) next() {
60 if s.rdOffset < len(s.src) {
// register the current offset as a line start with the file
63 s.lineOffset = s.offset
64 s.file.AddLine(s.offset)
// Start by assuming a one-byte character; r and w are overwritten below
// when a multi-byte UTF-8 sequence has to be decoded.
66 r, w := rune(s.src[s.rdOffset]), 1
69 s.error(s.offset, "illegal character NUL")
72 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
// utf8.DecodeRune returns (RuneError, 1) for an invalid encoding.
73 if r == utf8.RuneError && w == 1 {
74 s.error(s.offset, "illegal UTF-8 encoding")
// register the current offset as a line start with the file
82 s.lineOffset = s.offset
83 s.file.AddLine(s.offset)
89 // A Mode value is a set of flags (or 0).
90 // The flags control scanner behavior.
95 ScanComments Mode = 1 << iota // return comments as COMMENT tokens
96 dontInsertSemis // do not automatically insert semicolons - for testing only
99 // Init prepares the scanner s to tokenize the text src by setting the
100 // scanner at the beginning of src. The scanner uses the file set file
101 // for position information and it adds line information for each line.
102 // It is ok to re-use the same file when re-scanning the same file as
103 // line information which is already present is ignored. Init causes a
104 // panic if the file size does not match the src size.
106 // Calls to Scan will use the error handler err if they encounter a
107 // syntax error and err is not nil. Also, for each error encountered,
108 // the Scanner field ErrorCount is incremented by one. The mode parameter
109 // determines how comments are handled.
111 // Note that Init may call err if there is an error in the first character of the file.
114 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
115 // Explicitly initialize all fields since a scanner may be reused.
// The file registered with the file set must have exactly the size of src.
116 if file.Size() != len(src) {
117 panic("file size does not match src len")
// Keep only the directory part of the file name; interpretLineComment
// joins relative //line filenames onto it.
120 s.dir, _ = filepath.Split(file.Name())
// error reports msg for the source byte offset offs: the offset is converted
// to a token.Position through s.file and handed to the error handler s.err.
// NOTE(review): the lines around the call (e.g. a nil check on s.err and the
// ErrorCount update that Init's documentation describes) are not visible in
// this listing; confirm them against the full file.
135 func (s *Scanner) error(offs int, msg string) {
137 s.err.Error(s.file.Position(s.file.Pos(offs)), msg)
// prefix is the leading text that marks a //line directive comment.
142 var prefix = []byte("//line ")
// interpretLineComment inspects the comment text and, if it has the form
// "//line filename:line" with a positive decimal line number after the last
// ':', registers the new filename and line number with s.file. A relative
// filename is cleaned and joined onto s.dir first.
144 func (s *Scanner) interpretLineComment(text []byte) {
145 if bytes.HasPrefix(text, prefix) {
146 // get filename and line number, if any
// the last ':' separates filename from line number, so the filename itself may contain ':'
147 if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
148 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
149 // valid //line filename:line comment;
150 filename := filepath.Clean(string(text[len(prefix):i]))
151 if !filepath.IsAbs(filename) {
152 // make filename relative to current directory
153 filename = filepath.Join(s.dir, filename)
155 // update scanner position
156 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
// scanComment scans a //-style or /*-style comment whose initial '/' has
// already been consumed and returns the comment's source text, starting at
// that initial '/'. A comment that begins at the start of the current line is
// additionally passed to interpretLineComment. An unterminated comment is
// reported as "comment not terminated" at the comment's start offset.
162 func (s *Scanner) scanComment() string {
163 // initial '/' already consumed; s.ch == '/' || s.ch == '*'
164 offs := s.offset - 1 // position of initial '/'
// read up to, but not including, the newline or end-of-file
169 for s.ch != '\n' && s.ch >= 0 {
172 if offs == s.lineOffset {
173 // comment starts at the beginning of the current line
174 s.interpretLineComment(s.src[offs:s.offset])
// a '*' immediately followed by '/' closes the comment
184 if ch == '*' && s.ch == '/' {
190 s.error(offs, "comment not terminated")
193 return string(s.src[offs:s.offset])
// findLineEnd reads ahead over the comment(s) starting at the current
// position, looking for a newline or end-of-file, and reports the outcome as
// a bool. Scan calls it while a semicolon insertion is pending and emits the
// "\n" semicolon when the result is true. The deferred function rewinds the
// scanner to where it was when findLineEnd was called, so the read-ahead
// leaves the scanner position unchanged.
196 func (s *Scanner) findLineEnd() bool {
197 // initial '/' already consumed
199 defer func(offs int) {
200 // reset scanner state to where it was upon calling findLineEnd
203 s.rdOffset = offs + 1
204 s.next() // consume initial '/' again
207 // read ahead until a newline, EOF, or non-comment token is found
208 for s.ch == '/' || s.ch == '*' {
210 //-style comment always contains a newline
213 /*-style comment: look for newline */
// a '*' immediately followed by '/' closes the comment
221 if ch == '*' && s.ch == '/' {
226 s.skipWhitespace() // s.insertSemi is set
// end-of-file or a newline after the comment
227 if s.ch < 0 || s.ch == '\n' {
234 s.next() // consume '/'
// isLetter reports whether ch may appear as a letter in an identifier:
// an ASCII letter, the underscore, or a non-ASCII rune that unicode.IsLetter
// accepts. The ch >= 0x80 guard keeps the common ASCII case from reaching the
// Unicode lookup.
func isLetter(ch rune) bool {
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
}
// isDigit reports whether ch is a digit that may appear in an identifier:
// an ASCII digit '0'-'9' or a non-ASCII rune that unicode.IsDigit accepts.
// The ch >= 0x80 guard keeps the common ASCII case from reaching the Unicode
// lookup.
func isDigit(ch rune) bool {
	return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
}
// scanIdentifier advances over a run of characters accepted by isLetter or
// isDigit and returns the source text s.src[offs:s.offset] as a string.
248 func (s *Scanner) scanIdentifier() string {
250 for isLetter(s.ch) || isDigit(s.ch) {
253 return string(s.src[offs:s.offset])
// digitVal returns the numeric value of ch read as a hexadecimal digit:
// '0'-'9' yield 0-9, and 'a'-'f' or 'A'-'F' yield 10-15. Any other rune
// yields 16, which is larger than every legal digit value, so callers can
// test digitVal(ch) < base for any base up to 16.
func digitVal(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch - 'a' + 10)
	case 'A' <= ch && ch <= 'F':
		return int(ch - 'A' + 10)
	}
	return 16 // larger than any legal digit val
}
// scanMantissa loops for as long as the current character is a digit whose
// value, as computed by digitVal, is below base.
268 func (s *Scanner) scanMantissa(base int) {
269 for digitVal(s.ch) < base {
// scanNumber scans a numeric literal and returns its token kind together with
// the literal's source text. Scan passes seenDecimalPoint == true after it has
// consumed a '.' that is followed by a digit, and false when the literal
// starts with a digit.
//
// Errors are reported at the literal's start offset: "illegal hexadecimal
// number" when nothing follows a 0x/0X prefix, and "illegal octal number"
// when a digit 8 or 9 was seen in a literal treated as octal.
// NOTE(review): the lines connecting these checks are not visible in this
// listing; confirm the exact conditions against the full file.
274 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
275 // digitVal(s.ch) < 10
279 if seenDecimalPoint {
290 if s.ch == 'x' || s.ch == 'X' {
294 if s.offset-offs <= 2 {
295 // only scanned "0x" or "0X"
296 s.error(offs, "illegal hexadecimal number")
299 // octal int or float
300 seenDecimalDigit := false
302 if s.ch == '8' || s.ch == '9' {
303 // illegal octal int or float
304 seenDecimalDigit = true
// a fraction, exponent, or imaginary suffix follows
307 if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
311 if seenDecimalDigit {
312 s.error(offs, "illegal octal number")
318 // decimal int or float
// exponent marker
329 if s.ch == 'e' || s.ch == 'E' {
// exponent sign
332 if s.ch == '-' || s.ch == '+' {
344 return tok, string(s.src[offs:s.offset])
// scanEscape scans an escape sequence inside a literal that is delimited by
// quote. Single-character escapes are a, b, f, n, r, t, v, backslash and
// quote. Numeric escapes are read as a fixed number of digits in a given
// base; values above the permitted maximum, or in the surrogate range
// 0xd800-0xdfff, are reported as invalid Unicode code points.
// NOTE(review): the case labels selecting the three hexadecimal forms are
// not visible in this listing.
347 func (s *Scanner) scanEscape(quote rune) {
// i: digits still expected, base: radix of the digits, max: largest permitted value
350 var i, base, max uint32
352 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
// octal escape: three digits, at most 255
355 case '0', '1', '2', '3', '4', '5', '6', '7':
356 i, base, max = 3, 8, 255
// two hexadecimal digits, at most 255
359 i, base, max = 2, 16, 255
// four hexadecimal digits, at most unicode.MaxRune
362 i, base, max = 4, 16, unicode.MaxRune
// eight hexadecimal digits, at most unicode.MaxRune
365 i, base, max = 8, 16, unicode.MaxRune
367 s.next() // always make progress
368 s.error(offs, "unknown escape sequence")
// stop early at the closing quote or at end-of-file
373 for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
374 d := uint32(digitVal(s.ch))
376 s.error(s.offset, "illegal character in escape sequence")
382 // in case of an error, consume remaining chars
383 for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
// reject values above max and the surrogate range 0xd800-0xdfff
386 if x > max || 0xd800 <= x && x < 0xe000 {
387 s.error(offs, "escape sequence is invalid Unicode code point")
// scanChar scans a character literal whose opening quote has already been
// consumed and returns the literal's source text s.src[offs:s.offset].
// A newline or end-of-file inside the literal is reported as "character
// literal not terminated"; "illegal character literal" is reported by a check
// whose condition is not visible in this listing.
391 func (s *Scanner) scanChar() string {
392 // '\'' opening already consumed
400 if ch == '\n' || ch < 0 {
401 s.error(offs, "character literal not terminated")
413 s.error(offs, "illegal character literal")
416 return string(s.src[offs:s.offset])
// scanString scans an interpreted string literal whose opening double quote
// has already been consumed and returns the literal's source text
// s.src[offs:s.offset]. A newline or end-of-file inside the literal is
// reported as "string not terminated".
419 func (s *Scanner) scanString() string {
420 // '"' opening already consumed
426 if ch == '\n' || ch < 0 {
427 s.error(offs, "string not terminated")
437 return string(s.src[offs:s.offset])
// stripCR allocates a result buffer as long as b and walks b byte by byte.
// NOTE(review): the loop body and return statement are not visible in this
// listing; judging by the name, the result is presumably b with carriage
// returns removed - confirm against the full file.
440 func stripCR(b []byte) []byte {
441 c := make([]byte, len(b))
443 for _, ch := range b {
// scanRawString scans a raw string literal whose opening back quote has
// already been consumed. An unterminated literal is reported as "string not
// terminated". lit holds the literal's source bytes s.src[offs:s.offset].
// NOTE(review): how lit is post-processed before being returned is not
// visible in this listing.
452 func (s *Scanner) scanRawString() string {
453 // '`' opening already consumed
464 s.error(offs, "string not terminated")
471 lit := s.src[offs:s.offset]
// skipWhitespace loops while the current character is a space, tab, or
// carriage return. A newline also counts as whitespace, but only while
// s.insertSemi is false; with a semicolon insertion pending the loop stops
// at the newline so that Scan can turn it into a semicolon.
479 func (s *Scanner) skipWhitespace() {
480 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
485 // Helper functions for scanning multi-byte tokens such as >> += >>= .
486 // Different routines recognize different length tok_i based on matches
487 // of ch_i. If a token ends in '=', the result is tok1 or tok3
488 // respectively. Otherwise, the result is tok0 if there was no other
489 // matching character, or tok2 if the matching character was ch2.
//
// Scan calls them after the token's first character has been read, e.g.
// switch2(token.MUL, token.MUL_ASSIGN) for '*' and
// switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN) for '<'.

// switch2 chooses between tok0 and tok1, the form ending in '='.
491 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
// switch3 chooses between tok0, tok1 (ending in '='), and tok2 (next character is ch2).
499 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
// switch4 extends switch3 with tok3, the form where ch2 is followed by '='.
511 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
527 // Scan scans the next token and returns the token position, the token,
528 // and its literal string if applicable. The source end is indicated by token.EOF.
531 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
532 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
533 // has the corresponding value.
535 // If the returned token is token.SEMICOLON, the corresponding
536 // literal string is ";" if the semicolon was present in the source,
537 // and "\n" if the semicolon was inserted because of a newline or at EOF.
540 // If the returned token is token.ILLEGAL, the literal string is the
541 // offending character.
543 // In all other cases, Scan returns an empty literal string.
545 // For more tolerant parsing, Scan will return a valid token if
546 // possible even if a syntax error was encountered. Thus, even
547 // if the resulting token sequence contains no illegal tokens,
548 // a client may not assume that no error occurred. Instead it
549 // must check the scanner's ErrorCount or the number of calls
550 // of the error handler, if there was one installed.
552 // Scan adds line information to the file added to the file
553 // set with Init. Token positions are relative to that file
554 // and thus relative to the file set.
556 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
560 // current token start
561 pos = s.file.Pos(s.offset)
563 // determine token value
// identifiers and keywords: token.Lookup maps the scanned text to its token
567 lit = s.scanIdentifier()
568 tok = token.Lookup(lit)
570 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
// a decimal digit starts a number
573 case digitVal(ch) < 10:
575 tok, lit = s.scanNumber(false)
577 s.next() // always make progress
581 s.insertSemi = false // EOF consumed
582 return pos, token.SEMICOLON, "\n"
586 // we only reach here if s.insertSemi was
587 // set in the first place and exited early
588 // from s.skipWhitespace()
589 s.insertSemi = false // newline consumed
590 return pos, token.SEMICOLON, "\n"
602 lit = s.scanRawString()
604 tok = s.switch2(token.COLON, token.DEFINE)
// a '.' directly followed by a digit starts a number whose decimal point is already consumed
606 if digitVal(s.ch) < 10 {
608 tok, lit = s.scanNumber(true)
609 } else if s.ch == '.' {
621 tok = token.SEMICOLON
639 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
640 if tok == token.INC {
644 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
645 if tok == token.DEC {
649 tok = s.switch2(token.MUL, token.MUL_ASSIGN)
// a '/' followed by '/' or '*' starts a comment
651 if s.ch == '/' || s.ch == '*' {
// With a semicolon pending and a line end found by findLineEnd, emit the
// semicolon first and rewind to the comment's start so that the comment
// itself is scanned by the next call.
653 if s.insertSemi && s.findLineEnd() {
654 // reset position to the beginning of the comment
656 s.offset = s.file.Offset(pos)
657 s.rdOffset = s.offset + 1
658 s.insertSemi = false // newline consumed
659 return pos, token.SEMICOLON, "\n"
661 lit = s.scanComment()
// the caller did not ask for comments to be returned
662 if s.mode&ScanComments == 0 {
664 s.insertSemi = false // newline consumed
669 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
672 tok = s.switch2(token.REM, token.REM_ASSIGN)
674 tok = s.switch2(token.XOR, token.XOR_ASSIGN)
680 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
683 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
685 tok = s.switch2(token.ASSIGN, token.EQL)
687 tok = s.switch2(token.NOT, token.NEQ)
691 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
693 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
696 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
698 s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
699 insertSemi = s.insertSemi // preserve insertSemi info
// record the semicolon-insertion decision unless insertion is disabled (testing only)
704 if s.mode&dontInsertSemis == 0 {
705 s.insertSemi = insertSemi