1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 // An Error describes a failure to parse a regular expression
15 // and gives the offending expression.
21 func (e *Error) Error() string {
22 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
25 // An ErrorCode describes a failure to parse a regular expression.
30 ErrInternalError ErrorCode = "regexp/syntax: internal error"
33 ErrInvalidCharClass ErrorCode = "invalid character class"
34 ErrInvalidCharRange ErrorCode = "invalid character class range"
35 ErrInvalidEscape ErrorCode = "invalid escape sequence"
36 ErrInvalidNamedCapture ErrorCode = "invalid named capture"
37 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
38 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
39 ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
40 ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
41 ErrMissingBracket ErrorCode = "missing closing ]"
42 ErrMissingParen ErrorCode = "missing closing )"
43 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
44 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
47 func (e ErrorCode) String() string {
51 // Flags control the behavior of the parser and record information about regexp context.
55 FoldCase Flags = 1 << iota // case-insensitive match
56 Literal // treat pattern as literal string
57 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
58 DotNL // allow . to match newline
59 OneLine // treat ^ and $ as only matching at beginning and end of text
60 NonGreedy // make repetition operators default to non-greedy
61 PerlX // allow Perl extensions
62 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
63 WasDollar // regexp OpEndText was $, not \z
64 Simple // regexp contains no counted repetition
66 MatchNL = ClassNL | DotNL
68 Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
69 POSIX Flags = 0 // POSIX syntax
72 // Pseudo-ops for parsing stack.
74 opLeftParen = opPseudo + iota
79 flags Flags // parse mode flags
80 stack []*Regexp // stack of parsed expressions
82 numCap int // number of capturing groups seen
84 tmpClass []rune // temporary char class work space
87 func (p *parser) newRegexp(op Op) *Regexp {
99 func (p *parser) reuse(re *Regexp) {
104 // Parse stack manipulation.
106 // push pushes the regexp re onto the parse stack and returns the regexp.
107 func (p *parser) push(re *Regexp) *Regexp {
108 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
110 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
114 re.Rune = re.Rune[:1]
115 re.Flags = p.flags &^ FoldCase
116 } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
117 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
118 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
119 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
120 re.Op == OpCharClass && len(re.Rune) == 2 &&
121 re.Rune[0]+1 == re.Rune[1] &&
122 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
123 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
124 // Case-insensitive rune like [Aa] or [Δδ].
125 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
129 // Rewrite as (case-insensitive) literal.
131 re.Rune = re.Rune[:1]
132 re.Flags = p.flags | FoldCase
134 // Incremental concatenation.
138 p.stack = append(p.stack, re)
142 // maybeConcat implements incremental concatenation
143 // of literal runes into string nodes. The parser calls this
144 // before each push, so only the top fragment of the stack
145 // might need processing. Since this is called before a push,
146 // the topmost literal is no longer subject to operators like *
147 // (Otherwise ab* would turn into (ab)*.)
148 // If r >= 0 and there's a node left over, maybeConcat uses it
149 // to push r with the given flags.
150 // maybeConcat reports whether r was pushed.
151 func (p *parser) maybeConcat(r rune, flags Flags) bool {
159 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
163 // Push re1 into re2.
164 re2.Rune = append(re2.Rune, re1.Rune...)
166 // Reuse re1 if possible.
168 re1.Rune = re1.Rune0[:1]
174 p.stack = p.stack[:n-1]
176 return false // did not push r
179 // newLiteral returns a new OpLiteral Regexp with the given flags
180 func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
181 re := p.newRegexp(OpLiteral)
183 if flags&FoldCase != 0 {
187 re.Rune = re.Rune0[:1]
191 // minFoldRune returns the minimum rune fold-equivalent to r.
192 func minFoldRune(r rune) rune {
193 if r < MinFold || r > MaxFold {
198 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
206 // literal pushes a literal regexp for the rune r on the stack
207 // and returns that regexp.
208 func (p *parser) literal(r rune) {
209 p.push(p.newLiteral(r, p.flags))
212 // op pushes a regexp with the given op onto the stack
213 // and returns that regexp.
214 func (p *parser) op(op Op) *Regexp {
215 re := p.newRegexp(op)
220 // repeat replaces the top stack element with itself repeated according to op, min, max.
221 // before is the regexp suffix starting at the repetition operator.
222 // after is the regexp suffix following after the repetition operator.
223 // repeat returns an updated 'after' and an error, if any.
224 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
226 if p.flags&PerlX != 0 {
227 if len(after) > 0 && after[0] == '?' {
231 if lastRepeat != "" {
232 // In Perl it is not allowed to stack repetition operators:
233 // a** is a syntax error, not a doubled star, and a++ means
234 // something else entirely, which we don't support!
235 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
240 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
243 if sub.Op >= opPseudo {
244 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
246 re := p.newRegexp(op)
256 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
257 func (p *parser) concat() *Regexp {
260 // Scan down to find pseudo-operator | or (.
262 for i > 0 && p.stack[i-1].Op < opPseudo {
266 p.stack = p.stack[:i]
268 // Empty concatenation is special case.
270 return p.push(p.newRegexp(OpEmptyMatch))
273 return p.push(p.collapse(subs, OpConcat))
276 // alternate replaces the top of the stack (above the topmost '(') with its alternation.
277 func (p *parser) alternate() *Regexp {
278 // Scan down to find pseudo-operator (.
279 // There are no | above (.
281 for i > 0 && p.stack[i-1].Op < opPseudo {
285 p.stack = p.stack[:i]
287 // Make sure top class is clean.
288 // All the others already are (see swapVerticalBar).
290 cleanAlt(subs[len(subs)-1])
293 // Empty alternate is special case
294 // (shouldn't happen but easy to handle).
296 return p.push(p.newRegexp(OpNoMatch))
299 return p.push(p.collapse(subs, OpAlternate))
302 // cleanAlt cleans re for eventual inclusion in an alternation.
303 func cleanAlt(re *Regexp) {
306 re.Rune = cleanClass(&re.Rune)
307 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
312 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
314 re.Op = OpAnyCharNotNL
317 if cap(re.Rune)-len(re.Rune) > 100 {
318 // re.Rune will not grow any more.
319 // Make a copy or inline to reclaim storage.
320 re.Rune = append(re.Rune0[:0], re.Rune...)
325 // collapse returns the result of applying op to sub.
326 // If sub contains op nodes, they all get hoisted up
327 // so that there is never a concat of a concat or an
328 // alternate of an alternate.
329 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
333 re := p.newRegexp(op)
335 for _, sub := range subs {
337 re.Sub = append(re.Sub, sub.Sub...)
340 re.Sub = append(re.Sub, sub)
343 if op == OpAlternate {
344 re.Sub = p.factor(re.Sub, re.Flags)
345 if len(re.Sub) == 1 {
354 // factor factors common prefixes from the alternation list sub.
355 // It returns a replacement list that reuses the same storage and
356 // frees (passes to p.reuse) any removed *Regexps.
359 // ABC|ABD|AEF|BCX|BCY
360 // simplifies by literal prefix extraction to
361 // A(B(C|D)|EF)|BC(X|Y)
362 // which simplifies by character class introduction to
363 // A(B[CD]|EF)|BC[XY]
365 func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
370 // Round 1: Factor out common literal prefixes.
375 for i := 0; i <= len(sub); i++ {
376 // Invariant: the Regexps that were in sub[0:start] have been
377 // used or marked for reuse, and the slice space has been reused
378 // for out (len(out) <= start).
380 // Invariant: sub[start:i] consists of regexps that all begin
381 // with str as modified by strflags.
385 istr, iflags = p.leadingString(sub[i])
386 if iflags == strflags {
388 for same < len(str) && same < len(istr) && str[same] == istr[same] {
392 // Matches at least one rune in current range.
393 // Keep going around.
400 // Found end of a run with common leading literal string:
401 // sub[start:i] all begin with str[0:len(str)], but sub[i]
402 // does not even begin with str[0].
404 // Factor out common string and append factored expression to out.
406 // Nothing to do - run of length 0.
407 } else if i == start+1 {
408 // Just one: don't bother factoring.
409 out = append(out, sub[start])
411 // Construct factored form: prefix(suffix1|suffix2|...)
412 prefix := p.newRegexp(OpLiteral)
413 prefix.Flags = strflags
414 prefix.Rune = append(prefix.Rune[:0], str...)
416 for j := start; j < i; j++ {
417 sub[j] = p.removeLeadingString(sub[j], len(str))
419 suffix := p.collapse(sub[start:i], OpAlternate) // recurse
421 re := p.newRegexp(OpConcat)
422 re.Sub = append(re.Sub[:0], prefix, suffix)
423 out = append(out, re)
426 // Prepare for next iteration.
433 // Round 2: Factor out common complex prefixes,
434 // just the first piece of each concatenation,
435 // whatever it is. This is good enough a lot of the time.
439 for i := 0; i <= len(sub); i++ {
440 // Invariant: the Regexps that were in sub[0:start] have been
441 // used or marked for reuse, and the slice space has been reused
442 // for out (len(out) <= start).
444 // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
447 ifirst = p.leadingRegexp(sub[i])
448 if first != nil && first.Equal(ifirst) {
453 // Found end of a run with common leading regexp:
454 // sub[start:i] all begin with first but sub[i] does not.
456 // Factor out common regexp and append factored expression to out.
458 // Nothing to do - run of length 0.
459 } else if i == start+1 {
460 // Just one: don't bother factoring.
461 out = append(out, sub[start])
463 // Construct factored form: prefix(suffix1|suffix2|...)
465 for j := start; j < i; j++ {
466 reuse := j != start // prefix came from sub[start]
467 sub[j] = p.removeLeadingRegexp(sub[j], reuse)
469 suffix := p.collapse(sub[start:i], OpAlternate) // recurse
471 re := p.newRegexp(OpConcat)
472 re.Sub = append(re.Sub[:0], prefix, suffix)
473 out = append(out, re)
476 // Prepare for next iteration.
482 // Round 3: Collapse runs of single literals into character classes.
485 for i := 0; i <= len(sub); i++ {
486 // Invariant: the Regexps that were in sub[0:start] have been
487 // used or marked for reuse, and the slice space has been reused
488 // for out (len(out) <= start).
490 // Invariant: sub[start:i] consists of regexps that are either
491 // literal runes or character classes.
492 if i < len(sub) && isCharClass(sub[i]) {
496 // sub[i] is not a char or char class;
497 // emit char class for sub[start:i]...
499 // Nothing to do - run of length 0.
500 } else if i == start+1 {
501 out = append(out, sub[start])
503 // Make new char class.
504 // Start with most complex regexp in sub[start].
506 for j := start + 1; j < i; j++ {
507 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
511 sub[start], sub[max] = sub[max], sub[start]
513 for j := start + 1; j < i; j++ {
514 mergeCharClass(sub[start], sub[j])
518 out = append(out, sub[start])
521 // ... and then emit sub[i].
523 out = append(out, sub[i])
529 // Round 4: Collapse runs of empty matches into a single empty match.
533 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
536 out = append(out, sub[i])
543 // leadingString returns the leading literal string that re begins with.
544 // The string refers to storage in re or its children.
545 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
546 if re.Op == OpConcat && len(re.Sub) > 0 {
549 if re.Op != OpLiteral {
552 return re.Rune, re.Flags & FoldCase
555 // removeLeadingString removes the first n leading runes
556 // from the beginning of re. It returns the replacement for re.
557 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
558 if re.Op == OpConcat && len(re.Sub) > 0 {
559 // Removing a leading string in a concatenation
560 // might simplify the concatenation.
562 sub = p.removeLeadingString(sub, n)
564 if sub.Op == OpEmptyMatch {
568 // Impossible but handle.
576 copy(re.Sub, re.Sub[1:])
577 re.Sub = re.Sub[:len(re.Sub)-1]
583 if re.Op == OpLiteral {
584 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
585 if len(re.Rune) == 0 {
592 // leadingRegexp returns the leading regexp that re begins with.
593 // The regexp refers to storage in re or its children.
594 func (p *parser) leadingRegexp(re *Regexp) *Regexp {
595 if re.Op == OpEmptyMatch {
598 if re.Op == OpConcat && len(re.Sub) > 0 {
600 if sub.Op == OpEmptyMatch {
608 // removeLeadingRegexp removes the leading regexp in re.
609 // It returns the replacement for re.
610 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
611 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
612 if re.Op == OpConcat && len(re.Sub) > 0 {
616 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
631 return p.newRegexp(OpEmptyMatch)
634 func literalRegexp(s string, flags Flags) *Regexp {
635 re := &Regexp{Op: OpLiteral}
637 re.Rune = re.Rune0[:0] // use local storage for small strings
638 for _, c := range s {
639 if len(re.Rune) >= cap(re.Rune) {
640 // string is too long to fit in Rune0. let Go handle it
644 re.Rune = append(re.Rune, c)
651 func Parse(s string, flags Flags) (*Regexp, error) {
652 if flags&Literal != 0 {
653 // Trivial parser for literal string.
654 if err := checkUTF8(s); err != nil {
657 return literalRegexp(s, flags), nil
660 // Otherwise, must do real work.
677 if c, t, err = nextRune(t); err != nil {
683 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
684 // Flag changes and non-capturing groups.
685 if t, err = p.parsePerlFlags(t); err != nil {
691 p.op(opLeftParen).Cap = p.numCap
694 if err = p.parseVerticalBar(); err != nil {
699 if err = p.parseRightParen(); err != nil {
704 if p.flags&OneLine != 0 {
711 if p.flags&OneLine != 0 {
712 p.op(OpEndText).Flags |= WasDollar
718 if p.flags&DotNL != 0 {
725 if t, err = p.parseClass(t); err != nil {
739 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
747 min, max, after, ok := p.parseRepeat(t)
749 // If the repeat cannot be parsed, { is a literal.
754 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
755 // Numbers were too big, or max is present and min > max.
756 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
758 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
764 if p.flags&PerlX != 0 && len(t) >= 2 {
775 p.op(OpNoWordBoundary)
779 // any byte; not supported
780 return nil, &Error{ErrInvalidEscape, t[:2]}
782 // \Q ... \E: the ... is always literals
784 if i := strings.Index(t, `\E`); i < 0 {
791 p.push(literalRegexp(lit, p.flags))
800 re := p.newRegexp(OpCharClass)
803 // Look for Unicode character group like \p{Han}
804 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
805 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
817 // Perl character class escape.
818 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
826 // Ordinary single-character escape.
827 if c, t, err = p.parseEscape(t); err != nil {
836 if p.swapVerticalBar() {
838 p.stack = p.stack[:len(p.stack)-1]
844 return nil, &Error{ErrMissingParen, s}
846 return p.stack[0], nil
849 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
850 // If s is not of that form, it returns ok == false.
851 // If s has the right form but the values are too big, it returns min == -1, ok == true.
852 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
853 if s == "" || s[0] != '{' {
858 if min, s, ok1 = p.parseInt(s); !ok1 {
873 } else if max, s, ok1 = p.parseInt(s); !ok1 {
876 // parseInt found too big a number
880 if s == "" || s[0] != '}' {
888 // parsePerlFlags parses a Perl flag setting or non-capturing group or both,
889 // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
890 // The caller must have ensured that s begins with "(?".
891 func (p *parser) parsePerlFlags(s string) (rest string, err error) {
894 // Check for named captures, first introduced in Python's regexp library.
895 // As usual, there are three slightly different syntaxes:
897 // (?P<name>expr) the original, introduced by Python
898 // (?<name>expr) the .NET alteration, adopted by Perl 5.10
899 // (?'name'expr) another .NET alteration, adopted by Perl 5.10
901 // Perl 5.10 gave in and implemented the Python version too,
902 // but they claim that the last two are the preferred forms.
903 // PCRE and languages based on it (specifically, PHP and Ruby)
904 // support all three as well. EcmaScript 4 uses only the Python form.
906 // In both the open source world (via Code Search) and the
907 // Google source tree, (?P<expr>name) is the dominant form,
908 // so that's the one we implement. One is enough.
909 if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
911 end := strings.IndexRune(t, '>')
913 if err = checkUTF8(t); err != nil {
916 return "", &Error{ErrInvalidNamedCapture, s}
919 capture := t[:end+1] // "(?P<name>"
920 name := t[4:end] // "name"
921 if err = checkUTF8(name); err != nil {
924 if !isValidCaptureName(name) {
925 return "", &Error{ErrInvalidNamedCapture, capture}
928 // Like ordinary capture, but named.
930 re := p.op(opLeftParen)
933 return t[end+1:], nil
936 // Non-capturing group. Might also twiddle Perl flags.
944 if c, t, err = nextRune(t); err != nil {
965 // Switch to negation.
971 // Invert flags so that | above turn into &^ and vice versa.
972 // We'll invert flags again before using it below.
976 // End of flags, starting group or not.
993 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
996 // isValidCaptureName reports whether name
997 // is a valid capture name: [A-Za-z0-9_]+.
998 // PCRE limits names to 32 bytes.
999 // Python rejects names starting with digits.
1000 // We don't enforce either of those.
1001 func isValidCaptureName(name string) bool {
1005 for _, c := range name {
1006 if c != '_' && !isalnum(c) {
1013 // parseInt parses a decimal integer.
1014 func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1015 if s == "" || s[0] < '0' || '9' < s[0] {
1018 // Disallow leading zeros.
1019 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1023 for s != "" && '0' <= s[0] && s[0] <= '9' {
1028 // Have digits, compute value.
1029 t = t[:len(t)-len(s)]
1030 for i := 0; i < len(t); i++ {
1036 n = n*10 + int(t[i]) - '0'
1041 // can this be represented as a character class?
1042 // single-rune literal string, char class, ., and .|\n.
1043 func isCharClass(re *Regexp) bool {
1044 return re.Op == OpLiteral && len(re.Rune) == 1 ||
1045 re.Op == OpCharClass ||
1046 re.Op == OpAnyCharNotNL ||
1051 func matchRune(re *Regexp, r rune) bool {
1054 return len(re.Rune) == 1 && re.Rune[0] == r
1056 for i := 0; i < len(re.Rune); i += 2 {
1057 if re.Rune[i] <= r && r <= re.Rune[i+1] {
1062 case OpAnyCharNotNL:
1070 // parseVerticalBar handles a | in the input.
1071 func (p *parser) parseVerticalBar() error {
1074 // The concatenation we just parsed is on top of the stack.
1075 // If it sits above an opVerticalBar, swap it below
1076 // (things below an opVerticalBar become an alternation).
1077 // Otherwise, push a new vertical bar.
1078 if !p.swapVerticalBar() {
1085 // mergeCharClass makes dst = dst|src.
1086 // The caller must ensure that dst.Op >= src.Op,
1087 // to reduce the amount of copying.
1088 func mergeCharClass(dst, src *Regexp) {
1091 // src doesn't add anything.
1092 case OpAnyCharNotNL:
1094 if matchRune(src, '\n') {
1098 // src is simpler, so either literal or char class
1099 if src.Op == OpLiteral {
1100 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1102 dst.Rune = appendClass(dst.Rune, src.Rune)
1106 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1109 dst.Op = OpCharClass
1110 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1111 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1115 // If the top of the stack is an element followed by an opVerticalBar
1116 // swapVerticalBar swaps the two and returns true.
1117 // Otherwise it returns false.
1118 func (p *parser) swapVerticalBar() bool {
1119 // If above and below vertical bar are literal or char class,
1120 // can merge into a single char class.
1122 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1125 // Make re3 the more complex of the two.
1126 if re1.Op > re3.Op {
1130 mergeCharClass(re3, re1)
1132 p.stack = p.stack[:n-1]
1139 if re2.Op == opVerticalBar {
1141 // Now out of reach.
1142 // Clean opportunistically.
1143 cleanAlt(p.stack[n-3])
1153 // parseRightParen handles a ) in the input.
1154 func (p *parser) parseRightParen() error {
1156 if p.swapVerticalBar() {
1158 p.stack = p.stack[:len(p.stack)-1]
1164 return &Error{ErrInternalError, ""}
1168 p.stack = p.stack[:n-2]
1169 if re2.Op != opLeftParen {
1170 return &Error{ErrMissingParen, p.wholeRegexp}
1172 // Restore flags at time of paren.
1175 // Just for grouping.
1179 re2.Sub = re2.Sub0[:1]
1186 // parseEscape parses an escape sequence at the beginning of s
1187 // and returns the rune.
1188 func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
1191 return 0, "", &Error{ErrTrailingBackslash, ""}
1193 c, t, err := nextRune(t)
1201 if c < utf8.RuneSelf && !isalnum(c) {
1202 // Escaped non-word characters are always themselves.
1203 // PCRE is not quite so rigorous: it accepts things like
1204 // \q, but we don't. We once rejected \_, but too many
1205 // programs and people insist on using it, so allow \_.
1210 case '1', '2', '3', '4', '5', '6', '7':
1211 // Single non-zero digit is a backreference; not supported
1212 if t == "" || t[0] < '0' || t[0] > '7' {
1217 // Consume up to three octal digits; already have one.
1219 for i := 1; i < 3; i++ {
1220 if t == "" || t[0] < '0' || t[0] > '7' {
1223 r = r*8 + rune(t[0]) - '0'
1228 // Hexadecimal escapes.
1233 if c, t, err = nextRune(t); err != nil {
1237 // Any number of digits in braces.
1238 // Perl accepts any text at all; it ignores all text
1239 // after the first non-hex digit. We require only hex digits,
1240 // and at least one.
1247 if c, t, err = nextRune(t); err != nil {
1258 if r > unicode.MaxRune {
1269 // Easy case: two hex digits.
1271 if c, t, err = nextRune(t); err != nil {
1278 return x*16 + y, t, nil
1280 // C escapes. There is no case 'b', to avoid misparsing
1281 // the Perl word-boundary \b as the C backspace \b
1282 // when in POSIX mode. In Perl, /\b/ means word-boundary
1283 // but /[\b]/ means backspace. We don't support that.
1284 // If you want a backspace, embed a literal backspace
1285 // character or use \x08.
1299 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1302 // parseClassChar parses a character class character at the beginning of s
1304 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
1306 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1309 // Allow regular escape sequences even though
1310 // many need not be escaped in this context.
1312 return p.parseEscape(s)
1318 type charGroup struct {
1323 // parsePerlClassEscape parses a leading Perl character class escape like \d
1324 // from the beginning of s. If one is present, it appends the characters to r
1325 // and returns the new slice r and the remainder of the string.
1326 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
1327 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1330 g := perlGroup[s[0:2]]
1334 return p.appendGroup(r, g), s[2:]
1337 // parseNamedClass parses a leading POSIX named character class like [:alnum:]
1338 // from the beginning of s. If one is present, it appends the characters to r
1339 // and returns the new slice r and the remainder of the string.
1340 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
1341 if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1345 i := strings.Index(s[2:], ":]")
1350 name, s := s[0:i+2], s[i+2:]
1351 g := posixGroup[name]
1353 return nil, "", &Error{ErrInvalidCharRange, name}
1355 return p.appendGroup(r, g), s, nil
1358 func (p *parser) appendGroup(r []rune, g charGroup) []rune {
1359 if p.flags&FoldCase == 0 {
1361 r = appendNegatedClass(r, g.class)
1363 r = appendClass(r, g.class)
1366 tmp := p.tmpClass[:0]
1367 tmp = appendFoldedClass(tmp, g.class)
1369 tmp = cleanClass(&p.tmpClass)
1371 r = appendNegatedClass(r, tmp)
1373 r = appendClass(r, tmp)
1379 var anyTable = &unicode.RangeTable{
1380 []unicode.Range16{{0, 1<<16 - 1, 1}},
1381 []unicode.Range32{{1 << 16, unicode.MaxRune, 1}},
1384 // unicodeTable returns the unicode.RangeTable identified by name
1385 // and the table of additional fold-equivalent code points.
1386 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1387 // Special case: "Any" means any.
1389 return anyTable, anyTable
1391 if t := unicode.Categories[name]; t != nil {
1392 return t, unicode.FoldCategory[name]
1394 if t := unicode.Scripts[name]; t != nil {
1395 return t, unicode.FoldScript[name]
1400 // parseUnicodeClass parses a leading Unicode character class like \p{Han}
1401 // from the beginning of s. If one is present, it appends the characters to r
1402 // and returns the new slice r and the remainder of the string.
1403 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1404 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1408 // Committed to parse or return error.
1414 c, t, err := nextRune(t)
1418 var seq, name string
1420 // Single-letter name.
1421 seq = s[:len(s)-len(t)]
1424 // Name is in braces.
1425 end := strings.IndexRune(s, '}')
1427 if err = checkUTF8(s); err != nil {
1430 return nil, "", &Error{ErrInvalidCharRange, s}
1432 seq, t = s[:end+1], s[end+1:]
1434 if err = checkUTF8(name); err != nil {
1439 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1440 if name != "" && name[0] == '^' {
1445 tab, fold := unicodeTable(name)
1447 return nil, "", &Error{ErrInvalidCharRange, seq}
1450 if p.flags&FoldCase == 0 || fold == nil {
1452 r = appendTable(r, tab)
1454 r = appendNegatedTable(r, tab)
1457 // Merge and clean tab and fold in a temporary buffer.
1458 // This is necessary for the negative case and just tidy
1459 // for the positive case.
1460 tmp := p.tmpClass[:0]
1461 tmp = appendTable(tmp, tab)
1462 tmp = appendTable(tmp, fold)
1464 tmp = cleanClass(&p.tmpClass)
1466 r = appendClass(r, tmp)
1468 r = appendNegatedClass(r, tmp)
1474 // parseClass parses a character class at the beginning of s
1475 // and pushes it onto the parse stack.
1476 func (p *parser) parseClass(s string) (rest string, err error) {
1477 t := s[1:] // chop [
1478 re := p.newRegexp(OpCharClass)
1480 re.Rune = re.Rune0[:0]
1483 if t != "" && t[0] == '^' {
1487 // If character class does not match \n, add it here,
1488 // so that negation later will do the right thing.
1489 if p.flags&ClassNL == 0 {
1490 re.Rune = append(re.Rune, '\n', '\n')
1495 first := true // ] and - are okay as first char in class
1496 for t == "" || t[0] != ']' || first {
1497 // POSIX: - is only okay unescaped as first or last in class.
1498 // Perl: - is okay anywhere.
1499 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1500 _, size := utf8.DecodeRuneInString(t[1:])
1501 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1505 // Look for POSIX [:alnum:] etc.
1506 if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1507 nclass, nt, err := p.parseNamedClass(t, class)
1512 class, t = nclass, nt
1517 // Look for Unicode character group like \p{Han}.
1518 nclass, nt, err := p.parseUnicodeClass(t, class)
1523 class, t = nclass, nt
1527 // Look for Perl character class symbols (extension).
1528 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1529 class, t = nclass, nt
1533 // Single character or simple range.
1536 if lo, t, err = p.parseClassChar(t, s); err != nil {
1540 // [a-] means (a|-) so check for final ].
1541 if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1543 if hi, t, err = p.parseClassChar(t, s); err != nil {
1547 rng = rng[:len(rng)-len(t)]
1548 return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1551 if p.flags&FoldCase == 0 {
1552 class = AppendRange(class, lo, hi)
1554 class = appendFoldedRange(class, lo, hi)
1559 // Use &re.Rune instead of &class to avoid allocation.
1561 class = cleanClass(&re.Rune)
1563 class = negateClass(class)
1570 // cleanClass sorts the ranges (pairs of elements of r),
1571 // merges them, and eliminates duplicates.
1572 func cleanClass(rp *[]rune) []rune {
1574 // Sort by lo increasing, hi decreasing to break ties.
1575 sort.Sort(ranges{rp})
1582 // Merge abutting, overlapping.
1583 w := 2 // write index
1584 for i := 2; i < len(r); i += 2 {
1585 lo, hi := r[i], r[i+1]
1587 // merge with previous range
1593 // new disjoint range
1602 // appendLiteral returns the result of appending the literal x to the class r.
1603 func appendLiteral(r []rune, x rune, flags Flags) []rune {
1604 if flags&FoldCase != 0 {
1605 return appendFoldedRange(r, x, x)
1607 return AppendRange(r, x, x)
1610 // appendRange returns the result of appending the range lo-hi to the class r.
1611 func AppendRange(r []rune, lo, hi rune) []rune {
1612 // Expand last range or next to last range if it overlaps or abuts.
1613 // Checking two ranges helps when appending case-folded
1614 // alphabets, so that one range can be expanding A-Z and the
1615 // other expanding a-z.
1617 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
1619 rlo, rhi := r[n-i], r[n-i+1]
1620 if lo <= rhi+1 && rlo <= hi+1 {
1632 return append(r, lo, hi)
1636 // minimum and maximum runes involved in folding.
1637 // checked during test.
1642 // appendFoldedRange returns the result of appending the range lo-hi
1643 // and its case folding-equivalent runes to the class r.
1644 func appendFoldedRange(r []rune, lo, hi rune) []rune {
1646 if lo <= MinFold && hi >= MaxFold {
1647 // Range is full: folding can't add more.
1648 return AppendRange(r, lo, hi)
1650 if hi < MinFold || lo > MaxFold {
1651 // Range is outside folding possibilities.
1652 return AppendRange(r, lo, hi)
1655 // [lo, MinFold-1] needs no folding.
1656 r = AppendRange(r, lo, MinFold-1)
1660 // [MaxFold+1, hi] needs no folding.
1661 r = AppendRange(r, MaxFold+1, hi)
1665 // Brute force. Depend on AppendRange to coalesce ranges on the fly.
1666 for c := lo; c <= hi; c++ {
1667 r = AppendRange(r, c, c)
1668 f := unicode.SimpleFold(c)
1670 r = AppendRange(r, f, f)
1671 f = unicode.SimpleFold(f)
1677 // appendClass returns the result of appending the class x to the class r.
1678 // It assume x is clean.
1679 func appendClass(r []rune, x []rune) []rune {
1680 for i := 0; i < len(x); i += 2 {
1681 r = AppendRange(r, x[i], x[i+1])
1686 // appendFolded returns the result of appending the case folding of the class x to the class r.
1687 func appendFoldedClass(r []rune, x []rune) []rune {
1688 for i := 0; i < len(x); i += 2 {
1689 r = appendFoldedRange(r, x[i], x[i+1])
1694 // appendNegatedClass returns the result of appending the negation of the class x to the class r.
1695 // It assumes x is clean.
1696 func appendNegatedClass(r []rune, x []rune) []rune {
1697 nextLo := rune('\u0000')
1698 for i := 0; i < len(x); i += 2 {
1699 lo, hi := x[i], x[i+1]
1701 r = AppendRange(r, nextLo, lo-1)
1705 if nextLo <= unicode.MaxRune {
1706 r = AppendRange(r, nextLo, unicode.MaxRune)
1711 // appendTable returns the result of appending x to the class r.
1712 func appendTable(r []rune, x *unicode.RangeTable) []rune {
1713 for _, xr := range x.R16 {
1714 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1716 r = AppendRange(r, lo, hi)
1719 for c := lo; c <= hi; c += stride {
1720 r = AppendRange(r, c, c)
1723 for _, xr := range x.R32 {
1724 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1726 r = AppendRange(r, lo, hi)
1729 for c := lo; c <= hi; c += stride {
1730 r = AppendRange(r, c, c)
1736 // appendNegatedTable returns the result of appending the negation of x to the class r.
1737 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
1738 nextLo := rune('\u0000') // lo end of next class to add
1739 for _, xr := range x.R16 {
1740 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1743 r = AppendRange(r, nextLo, lo-1)
1748 for c := lo; c <= hi; c += stride {
1750 r = AppendRange(r, nextLo, c-1)
1755 for _, xr := range x.R32 {
1756 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1759 r = AppendRange(r, nextLo, lo-1)
1764 for c := lo; c <= hi; c += stride {
1766 r = AppendRange(r, nextLo, c-1)
1771 if nextLo <= unicode.MaxRune {
1772 r = AppendRange(r, nextLo, unicode.MaxRune)
1777 // negateClass overwrites r and returns r's negation.
1778 // It assumes the class r is already clean.
1779 func negateClass(r []rune) []rune {
1780 nextLo := rune('\u0000') // lo end of next class to add
1781 w := 0 // write index
1782 for i := 0; i < len(r); i += 2 {
1783 lo, hi := r[i], r[i+1]
1792 if nextLo <= unicode.MaxRune {
1793 // It's possible for the negation to have one more
1794 // range - this one - than the original class, so use append.
1795 r = append(r, nextLo, unicode.MaxRune)
1800 // ranges implements sort.Interface on a []rune.
1801 // The choice of receiver type definition is strange
1802 // but avoids an allocation since we already have
1804 type ranges struct {
1808 func (ra ranges) Less(i, j int) bool {
1812 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
1815 func (ra ranges) Len() int {
1816 return len(*ra.p) / 2
1819 func (ra ranges) Swap(i, j int) {
1823 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
1826 func checkUTF8(s string) error {
1828 rune, size := utf8.DecodeRuneInString(s)
1829 if rune == utf8.RuneError && size == 1 {
1830 return &Error{Code: ErrInvalidUTF8, Expr: s}
1837 func nextRune(s string) (c rune, t string, err error) {
1838 c, size := utf8.DecodeRuneInString(s)
1839 if c == utf8.RuneError && size == 1 {
1840 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1842 return c, s[size:], nil
1845 func isalnum(c rune) bool {
1846 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
1849 func unhex(c rune) rune {
1850 if '0' <= c && c <= '9' {
1853 if 'a' <= c && c <= 'f' {
1856 if 'A' <= c && c <= 'F' {