1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package syntax parses regular expressions into syntax trees.
9 // Note to implementers:
10 // In this package, re is always a *Regexp and r is always a rune.
19 // A Regexp is a node in a regular expression syntax tree.
23 Sub []*Regexp // subexpressions, if any
24 Sub0 [1]*Regexp // storage for short Sub
25 Rune []rune // matched runes, for OpLiteral, OpCharClass
26 Rune0 [2]rune // storage for short Rune
27 Min, Max int // min, max for OpRepeat
28 Cap int // capturing index, for OpCapture
29 Name string // capturing name, for OpCapture
32 // An Op is a single regular expression operator.
35 // Operators are listed in precedence order, tightest binding to weakest.
36 // Character class operators are listed simplest to most complex
37 // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
40 OpNoMatch Op = 1 + iota // matches no strings
41 OpEmptyMatch // matches empty string
42 OpLiteral // matches Runes sequence
43 OpCharClass // matches Runes interpreted as range pair list
44 OpAnyCharNotNL // matches any character
45 OpAnyChar // matches any character
46 OpBeginLine // matches empty string at beginning of line
47 OpEndLine // matches empty string at end of line
48 OpBeginText // matches empty string at beginning of text
49 OpEndText // matches empty string at end of text
50 OpWordBoundary // matches word boundary `\b`
51 OpNoWordBoundary // matches word non-boundary `\B`
52 OpCapture // capturing subexpression with index Cap, optional name Name
53 OpStar // matches Sub[0] zero or more times
54 OpPlus // matches Sub[0] one or more times
55 OpQuest // matches Sub[0] zero or one times
56 OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
57 OpConcat // matches concatenation of Subs
58 OpAlternate // matches alternation of Subs
61 const opPseudo Op = 128 // where pseudo-ops start
63 // Equal returns true if x and y have identical structure.
64 func (x *Regexp) Equal(y *Regexp) bool {
65 if x == nil || y == nil {
73 // The parse flags remember whether this is \z or \Z.
74 if x.Flags&WasDollar != y.Flags&WasDollar {
78 case OpLiteral, OpCharClass:
79 if len(x.Rune) != len(y.Rune) {
82 for i, r := range x.Rune {
88 case OpAlternate, OpConcat:
89 if len(x.Sub) != len(y.Sub) {
92 for i, sub := range x.Sub {
93 if !sub.Equal(y.Sub[i]) {
98 case OpStar, OpPlus, OpQuest:
99 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
104 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
109 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
116 // writeRegexp writes the Perl syntax for the regular expression re to b.
117 func writeRegexp(b *bytes.Buffer, re *Regexp) {
120 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
122 b.WriteString(`[^\x00-\x{10FFFF}]`)
124 b.WriteString(`(?:)`)
126 if re.Flags&FoldCase != 0 {
127 b.WriteString(`(?i:`)
129 for _, r := range re.Rune {
132 if re.Flags&FoldCase != 0 {
136 if len(re.Rune)%2 != 0 {
137 b.WriteString(`[invalid char class]`)
141 if len(re.Rune) == 0 {
142 b.WriteString(`^\x00-\x{10FFFF}`)
143 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
144 // Contains 0 and MaxRune. Probably a negated class.
147 for i := 1; i < len(re.Rune)-1; i += 2 {
148 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
149 escape(b, lo, lo == '-')
152 escape(b, hi, hi == '-')
156 for i := 0; i < len(re.Rune); i += 2 {
157 lo, hi := re.Rune[i], re.Rune[i+1]
158 escape(b, lo, lo == '-')
161 escape(b, hi, hi == '-')
167 b.WriteString(`(?-s:.)`)
169 b.WriteString(`(?s:.)`)
177 if re.Flags&WasDollar != 0 {
178 b.WriteString(`(?-m:$)`)
184 case OpNoWordBoundary:
188 b.WriteString(`(?P<`)
189 b.WriteString(re.Name)
194 if re.Sub[0].Op != OpEmptyMatch {
195 writeRegexp(b, re.Sub[0])
198 case OpStar, OpPlus, OpQuest, OpRepeat:
199 if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
215 b.WriteString(strconv.Itoa(re.Min))
216 if re.Max != re.Min {
219 b.WriteString(strconv.Itoa(re.Max))
224 if re.Flags&NonGreedy != 0 {
228 for _, sub := range re.Sub {
229 if sub.Op == OpAlternate {
238 for i, sub := range re.Sub {
247 func (re *Regexp) String() string {
253 const meta = `\.+*?()|[]{}^$`
255 func escape(b *bytes.Buffer, r rune, force bool) {
256 if unicode.IsPrint(r) {
257 if strings.IndexRune(meta, r) >= 0 || force {
280 s := strconv.Itob(int(r), 16)
288 b.WriteString(strconv.Itob(int(r), 16))
293 // MaxCap walks the regexp to find the maximum capture index.
294 func (re *Regexp) MaxCap() int {
296 if re.Op == OpCapture {
299 for _, sub := range re.Sub {
300 if n := sub.MaxCap(); m < n {