libgo/go/ebnf/ebnf.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package ebnf is a library for EBNF grammars. The input is text ([]byte)
   6 // satisfying the following grammar (represented itself in EBNF):
   7 //
   8 //      Production  = name "=" [ Expression ] "." .
   9 //      Expression  = Alternative { "|" Alternative } .
  10 //      Alternative = Term { Term } .
  11 //      Term        = name | token [ "…" token ] | Group | Option | Repetition .
  12 //      Group       = "(" Expression ")" .
  13 //      Option      = "[" Expression "]" .
  14 //      Repetition  = "{" Expression "}" .
  15 //
  16 // A name is a Go identifier, a token is a Go string, and comments
  17 // and white space follow the same rules as for the Go language.
  18 // Production names starting with an uppercase Unicode letter denote
  19 // non-terminal productions (i.e., productions which allow white-space
  20 // and comments between tokens); all other production names denote
  21 // lexical productions.
  22 //
  23 package ebnf
  24
  25 import (
  26         "go/scanner"
  27         "go/token"
  28         "os"
  29         "unicode"
  30         "utf8"
  31 )
  32
  33 // ----------------------------------------------------------------------------
  34 // Internal representation
  35
  36 type (
  37         // An Expression node represents a production expression.
  38         Expression interface {
  39                 // Pos is the position of the first character of the syntactic construct
  40                 Pos() token.Pos
  41         }
  42
  43         // An Alternative node represents a non-empty list of alternative expressions.
  44         Alternative []Expression // x | y | z
  45
  46         // A Sequence node represents a non-empty list of sequential expressions.
  47         Sequence []Expression // x y z
  48
  49         // A Name node represents a production name.
  50         Name struct {
  51                 StringPos token.Pos
  52                 String    string
  53         }
  54
  55         // A Token node represents a literal.
  56         Token struct {
  57                 StringPos token.Pos
  58                 String    string
  59         }
  60
  61         // A List node represents a range of characters.
  62         Range struct {
  63                 Begin, End *Token // begin ... end
  64         }
  65
  66         // A Group node represents a grouped expression.
  67         Group struct {
  68                 Lparen token.Pos
  69                 Body   Expression // (body)
  70         }
  71
  72         // An Option node represents an optional expression.
  73         Option struct {
  74                 Lbrack token.Pos
  75                 Body   Expression // [body]
  76         }
  77
  78         // A Repetition node represents a repeated expression.
  79         Repetition struct {
  80                 Lbrace token.Pos
  81                 Body   Expression // {body}
  82         }
  83
  84         // A Bad node stands for pieces of source code that lead to a parse error.
  85         Bad struct {
  86                 TokPos token.Pos
  87                 Error  string // parser error message
  88         }
  89
  90         // A Production node represents an EBNF production.
  91         Production struct {
  92                 Name *Name
  93                 Expr Expression
  94         }
  95
  96         // A Grammar is a set of EBNF productions. The map
  97         // is indexed by production name.
  98         //
  99         Grammar map[string]*Production
 100 )
 101
 102 func (x Alternative) Pos() token.Pos { return x[0].Pos() } // the parser always generates non-empty Alternative
 103 func (x Sequence) Pos() token.Pos    { return x[0].Pos() } // the parser always generates non-empty Sequences
 104 func (x *Name) Pos() token.Pos       { return x.StringPos }
 105 func (x *Token) Pos() token.Pos      { return x.StringPos }
 106 func (x *Range) Pos() token.Pos      { return x.Begin.Pos() }
 107 func (x *Group) Pos() token.Pos      { return x.Lparen }
 108 func (x *Option) Pos() token.Pos     { return x.Lbrack }
 109 func (x *Repetition) Pos() token.Pos { return x.Lbrace }
 110 func (x *Bad) Pos() token.Pos        { return x.TokPos }
 111 func (x *Production) Pos() token.Pos { return x.Name.Pos() }
 112
 113 // ----------------------------------------------------------------------------
 114 // Grammar verification
 115
 116 func isLexical(name string) bool {
 117         ch, _ := utf8.DecodeRuneInString(name)
 118         return !unicode.IsUpper(ch)
 119 }
 120
 121 type verifier struct {
 122         fset *token.FileSet
 123         scanner.ErrorVector
 124         worklist []*Production
 125         reached  Grammar // set of productions reached from (and including) the root production
 126         grammar  Grammar
 127 }
 128
 129 func (v *verifier) error(pos token.Pos, msg string) {
 130         v.Error(v.fset.Position(pos), msg)
 131 }
 132
 133 func (v *verifier) push(prod *Production) {
 134         name := prod.Name.String
 135         if _, found := v.reached[name]; !found {
 136                 v.worklist = append(v.worklist, prod)
 137                 v.reached[name] = prod
 138         }
 139 }
 140
 141 func (v *verifier) verifyChar(x *Token) int {
 142         s := x.String
 143         if utf8.RuneCountInString(s) != 1 {
 144                 v.error(x.Pos(), "single char expected, found "+s)
 145                 return 0
 146         }
 147         ch, _ := utf8.DecodeRuneInString(s)
 148         return ch
 149 }
 150
 151 func (v *verifier) verifyExpr(expr Expression, lexical bool) {
 152         switch x := expr.(type) {
 153         case nil:
 154                 // empty expression
 155         case Alternative:
 156                 for _, e := range x {
 157                         v.verifyExpr(e, lexical)
 158                 }
 159         case Sequence:
 160                 for _, e := range x {
 161                         v.verifyExpr(e, lexical)
 162                 }
 163         case *Name:
 164                 // a production with this name must exist;
 165                 // add it to the worklist if not yet processed
 166                 if prod, found := v.grammar[x.String]; found {
 167                         v.push(prod)
 168                 } else {
 169                         v.error(x.Pos(), "missing production "+x.String)
 170                 }
 171                 // within a lexical production references
 172                 // to non-lexical productions are invalid
 173                 if lexical && !isLexical(x.String) {
 174                         v.error(x.Pos(), "reference to non-lexical production "+x.String)
 175                 }
 176         case *Token:
 177                 // nothing to do for now
 178         case *Range:
 179                 i := v.verifyChar(x.Begin)
 180                 j := v.verifyChar(x.End)
 181                 if i >= j {
 182                         v.error(x.Pos(), "decreasing character range")
 183                 }
 184         case *Group:
 185                 v.verifyExpr(x.Body, lexical)
 186         case *Option:
 187                 v.verifyExpr(x.Body, lexical)
 188         case *Repetition:
 189                 v.verifyExpr(x.Body, lexical)
 190         default:
 191                 panic("unreachable")
 192         }
 193 }
 194
 195 func (v *verifier) verify(fset *token.FileSet, grammar Grammar, start string) {
 196         // find root production
 197         root, found := grammar[start]
 198         if !found {
 199                 // token.NoPos doesn't require a file set;
 200                 // ok to set v.fset only afterwards
 201                 v.error(token.NoPos, "no start production "+start)
 202                 return
 203         }
 204
 205         // initialize verifier
 206         v.fset = fset
 207         v.ErrorVector.Reset()
 208         v.worklist = v.worklist[0:0]
 209         v.reached = make(Grammar)
 210         v.grammar = grammar
 211
 212         // work through the worklist
 213         v.push(root)
 214         for {
 215                 n := len(v.worklist) - 1
 216                 if n < 0 {
 217                         break
 218                 }
 219                 prod := v.worklist[n]
 220                 v.worklist = v.worklist[0:n]
 221                 v.verifyExpr(prod.Expr, isLexical(prod.Name.String))
 222         }
 223
 224         // check if all productions were reached
 225         if len(v.reached) < len(v.grammar) {
 226                 for name, prod := range v.grammar {
 227                         if _, found := v.reached[name]; !found {
 228                                 v.error(prod.Pos(), name+" is unreachable")
 229                         }
 230                 }
 231         }
 232 }
 233
 234 // Verify checks that:
 235 //      - all productions used are defined
 236 //      - all productions defined are used when beginning at start
 237 //      - lexical productions refer only to other lexical productions
 238 //
 239 // Position information is interpreted relative to the file set fset.
 240 //
 241 func Verify(fset *token.FileSet, grammar Grammar, start string) os.Error {
 242         var v verifier
 243         v.verify(fset, grammar, start)
 244         return v.GetError(scanner.Sorted)
 245 }