// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+// Package syntax parses regular expressions into parse trees and compiles
+// parse trees into programs. Most clients of regular expressions will use
+// the facilities of package regexp (such as Compile and Match) instead of
+// this package.
package syntax
import (
- "os"
"sort"
"strings"
"unicode"
- "utf8"
+ "unicode/utf8"
)
// An Error describes a failure to parse a regular expression
Expr string
}
-func (e *Error) String() string {
+func (e *Error) Error() string {
return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
}
free *Regexp
numCap int // number of capturing groups seen
wholeRegexp string
- tmpClass []int // temporary char class work space
+ tmpClass []rune // temporary char class work space
}
func (p *parser) newRegexp(op Op) *Regexp {
// If r >= 0 and there's a node left over, maybeConcat uses it
// to push r with the given flags.
// maybeConcat reports whether r was pushed.
-func (p *parser) maybeConcat(r int, flags Flags) bool {
+func (p *parser) maybeConcat(r rune, flags Flags) bool {
n := len(p.stack)
if n < 2 {
return false
}
// newLiteral returns a new OpLiteral Regexp with the given flags
-func (p *parser) newLiteral(r int, flags Flags) *Regexp {
+func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
re := p.newRegexp(OpLiteral)
re.Flags = flags
if flags&FoldCase != 0 {
}
// minFoldRune returns the minimum rune fold-equivalent to r.
-func minFoldRune(r int) int {
+func minFoldRune(r rune) rune {
if r < MinFold || r > MaxFold {
return r
}
// literal pushes a literal regexp for the rune r on the stack
// and returns that regexp.
-func (p *parser) literal(r int) {
+func (p *parser) literal(r rune) {
p.push(p.newLiteral(r, p.flags))
}
// before is the regexp suffix starting at the repetition operator.
// after is the regexp suffix following after the repetition operator.
// repeat returns an updated 'after' and an error, if any.
-func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, os.Error) {
+func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
flags := p.flags
if p.flags&PerlX != 0 {
if len(after) > 0 && after[0] == '?' {
}
// Round 1: Factor out common literal prefixes.
- var str []int
+ var str []rune
var strflags Flags
start := 0
out := sub[:0]
//
// Invariant: sub[start:i] consists of regexps that all begin
// with str as modified by strflags.
- var istr []int
+ var istr []rune
var iflags Flags
if i < len(sub) {
istr, iflags = p.leadingString(sub[i])
// leadingString returns the leading literal string that re begins with.
// The string refers to storage in re or its children.
-func (p *parser) leadingString(re *Regexp) ([]int, Flags) {
+func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
if re.Op == OpConcat && len(re.Sub) > 0 {
re = re.Sub[0]
}
for _, c := range s {
if len(re.Rune) >= cap(re.Rune) {
// string is too long to fit in Rune0. let Go handle it
- re.Rune = []int(s)
+ re.Rune = []rune(s)
break
}
re.Rune = append(re.Rune, c)
// Parsing.
-func Parse(s string, flags Flags) (*Regexp, os.Error) {
+// Parse parses a regular expression string s, controlled by the specified
+// Flags, and returns a regular expression parse tree. The syntax is
+// described in the top-level comment for package regexp.
+func Parse(s string, flags Flags) (*Regexp, error) {
if flags&Literal != 0 {
// Trivial parser for literal string.
if err := checkUTF8(s); err != nil {
// Otherwise, must do real work.
var (
p parser
- err os.Error
- c int
+ err error
+ c rune
op Op
lastRepeat string
min, max int
// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
// like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
// The caller must have ensured that s begins with "(?".
-func (p *parser) parsePerlFlags(s string) (rest string, err os.Error) {
+func (p *parser) parsePerlFlags(s string) (rest string, err error) {
t := s
// Check for named captures, first introduced in Python's regexp library.
}
// Non-capturing group. Might also twiddle Perl flags.
- var c int
+ var c rune
t = t[2:] // skip (?
flags := p.flags
sign := +1
}
// does re match r?
-func matchRune(re *Regexp, r int) bool {
+func matchRune(re *Regexp, r rune) bool {
switch re.Op {
case OpLiteral:
return len(re.Rune) == 1 && re.Rune[0] == r
}
// parseVerticalBar handles a | in the input.
-func (p *parser) parseVerticalBar() os.Error {
+func (p *parser) parseVerticalBar() error {
p.concat()
// The concatenation we just parsed is on top of the stack.
}
// parseRightParen handles a ) in the input.
-func (p *parser) parseRightParen() os.Error {
+func (p *parser) parseRightParen() error {
p.concat()
if p.swapVerticalBar() {
// pop vertical bar
// parseEscape parses an escape sequence at the beginning of s
// and returns the rune.
-func (p *parser) parseEscape(s string) (r int, rest string, err os.Error) {
+func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
t := s[1:]
if t == "" {
return 0, "", &Error{ErrTrailingBackslash, ""}
if t == "" || t[0] < '0' || t[0] > '7' {
break
}
- r = r*8 + int(t[0]) - '0'
+ r = r*8 + rune(t[0]) - '0'
t = t[1:]
}
return r, t, nil
// parseClassChar parses a character class character at the beginning of s
// and returns it.
-func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) {
+func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
if s == "" {
return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
}
type charGroup struct {
sign int
- class []int
+ class []rune
}
// parsePerlClassEscape parses a leading Perl character class escape like \d
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
-func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string) {
+func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
return
}
// parseNamedClass parses a leading POSIX named character class like [:alnum:]
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
-func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err os.Error) {
+func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
if len(s) < 2 || s[0] != '[' || s[1] != ':' {
return
}
return p.appendGroup(r, g), s, nil
}
-func (p *parser) appendGroup(r []int, g charGroup) []int {
+func (p *parser) appendGroup(r []rune, g charGroup) []rune {
if p.flags&FoldCase == 0 {
if g.sign < 0 {
r = appendNegatedClass(r, g.class)
}
var anyTable = &unicode.RangeTable{
- []unicode.Range16{{0, 1<<16 - 1, 1}},
- []unicode.Range32{{1 << 16, unicode.MaxRune, 1}},
+ R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
+ R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
}
// unicodeTable returns the unicode.RangeTable identified by name
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
// from the beginning of s. If one is present, it appends the characters to r
// and returns the new slice r and the remainder of the string.
-func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, err os.Error) {
+func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
return
}
// parseClass parses a character class at the beginning of s
// and pushes it onto the parse stack.
-func (p *parser) parseClass(s string) (rest string, err os.Error) {
+func (p *parser) parseClass(s string) (rest string, err error) {
t := s[1:] // chop [
re := p.newRegexp(OpCharClass)
re.Flags = p.flags
// Single character or simple range.
rng := t
- var lo, hi int
+ var lo, hi rune
if lo, t, err = p.parseClassChar(t, s); err != nil {
return "", err
}
// cleanClass sorts the ranges (pairs of elements of r),
// merges them, and eliminates duplicates.
-func cleanClass(rp *[]int) []int {
+func cleanClass(rp *[]rune) []rune {
// Sort by lo increasing, hi decreasing to break ties.
sort.Sort(ranges{rp})
}
// appendLiteral returns the result of appending the literal x to the class r.
-func appendLiteral(r []int, x int, flags Flags) []int {
+func appendLiteral(r []rune, x rune, flags Flags) []rune {
if flags&FoldCase != 0 {
return appendFoldedRange(r, x, x)
}
return AppendRange(r, x, x)
}
-// AppendRange returns the result of appending the range lo-hi to the class r.
-func AppendRange(r []int, lo, hi int) []int {
+// appendRange returns the result of appending the range lo-hi to the class r.
+func AppendRange(r []rune, lo, hi rune) []rune {
// Expand last range or next to last range if it overlaps or abuts.
// Checking two ranges helps when appending case-folded
// alphabets, so that one range can be expanding A-Z and the
// appendFoldedRange returns the result of appending the range lo-hi
// and its case folding-equivalent runes to the class r.
-func appendFoldedRange(r []int, lo, hi int) []int {
+func appendFoldedRange(r []rune, lo, hi rune) []rune {
// Optimizations.
if lo <= MinFold && hi >= MaxFold {
// Range is full: folding can't add more.
// appendClass returns the result of appending the class x to the class r.
// It assume x is clean.
-func appendClass(r []int, x []int) []int {
+func appendClass(r []rune, x []rune) []rune {
for i := 0; i < len(x); i += 2 {
r = AppendRange(r, x[i], x[i+1])
}
}
// appendFolded returns the result of appending the case folding of the class x to the class r.
-func appendFoldedClass(r []int, x []int) []int {
+func appendFoldedClass(r []rune, x []rune) []rune {
for i := 0; i < len(x); i += 2 {
r = appendFoldedRange(r, x[i], x[i+1])
}
// appendNegatedClass returns the result of appending the negation of the class x to the class r.
// It assumes x is clean.
-func appendNegatedClass(r []int, x []int) []int {
- nextLo := 0
+func appendNegatedClass(r []rune, x []rune) []rune {
+ nextLo := '\u0000'
for i := 0; i < len(x); i += 2 {
lo, hi := x[i], x[i+1]
if nextLo <= lo-1 {
}
// appendTable returns the result of appending x to the class r.
-func appendTable(r []int, x *unicode.RangeTable) []int {
+func appendTable(r []rune, x *unicode.RangeTable) []rune {
for _, xr := range x.R16 {
- lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
+ lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
if stride == 1 {
r = AppendRange(r, lo, hi)
continue
}
}
for _, xr := range x.R32 {
- lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
+ lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
if stride == 1 {
r = AppendRange(r, lo, hi)
continue
}
// appendNegatedTable returns the result of appending the negation of x to the class r.
-func appendNegatedTable(r []int, x *unicode.RangeTable) []int {
- nextLo := 0 // lo end of next class to add
+func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
+ nextLo := '\u0000' // lo end of next class to add
for _, xr := range x.R16 {
- lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
+ lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
if stride == 1 {
if nextLo <= lo-1 {
r = AppendRange(r, nextLo, lo-1)
}
}
for _, xr := range x.R32 {
- lo, hi, stride := int(xr.Lo), int(xr.Hi), int(xr.Stride)
+ lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
if stride == 1 {
if nextLo <= lo-1 {
r = AppendRange(r, nextLo, lo-1)
// negateClass overwrites r and returns r's negation.
// It assumes the class r is already clean.
-func negateClass(r []int) []int {
- nextLo := 0 // lo end of next class to add
- w := 0 // write index
+func negateClass(r []rune) []rune {
+ nextLo := '\u0000' // lo end of next class to add
+ w := 0 // write index
for i := 0; i < len(r); i += 2 {
lo, hi := r[i], r[i+1]
if nextLo <= lo-1 {
// ranges implements sort.Interface on a []rune.
// The choice of receiver type definition is strange
// but avoids an allocation since we already have
-// a *[]int.
+// a *[]rune.
type ranges struct {
- p *[]int
+ p *[]rune
}
func (ra ranges) Less(i, j int) bool {
p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
}
-func checkUTF8(s string) os.Error {
+func checkUTF8(s string) error {
for s != "" {
rune, size := utf8.DecodeRuneInString(s)
if rune == utf8.RuneError && size == 1 {
return nil
}
-func nextRune(s string) (c int, t string, err os.Error) {
+func nextRune(s string) (c rune, t string, err error) {
c, size := utf8.DecodeRuneInString(s)
if c == utf8.RuneError && size == 1 {
return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
return c, s[size:], nil
}
-func isalnum(c int) bool {
+func isalnum(c rune) bool {
return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
}
-func unhex(c int) int {
+func unhex(c rune) rune {
if '0' <= c && c <= '9' {
return c - '0'
}