libgo/go/regexp/syntax/parse.go

   1 // Copyright 2011 The Go Authors.  All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package syntax
   6
   7 import (
   8         "sort"
   9         "strings"
  10         "unicode"
  11         "utf8"
  12 )
  13
  14 // An Error describes a failure to parse a regular expression
  15 // and gives the offending expression.
  16 type Error struct {
  17         Code ErrorCode
  18         Expr string
  19 }
  20
  21 func (e *Error) Error() string {
  22         return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
  23 }
  24
  25 // An ErrorCode describes a failure to parse a regular expression.
  26 type ErrorCode string
  27
  28 const (
  29         // Unexpected error
  30         ErrInternalError ErrorCode = "regexp/syntax: internal error"
  31
  32         // Parse errors
  33         ErrInvalidCharClass      ErrorCode = "invalid character class"
  34         ErrInvalidCharRange      ErrorCode = "invalid character class range"
  35         ErrInvalidEscape         ErrorCode = "invalid escape sequence"
  36         ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
  37         ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
  38         ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
  39         ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
  40         ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
  41         ErrMissingBracket        ErrorCode = "missing closing ]"
  42         ErrMissingParen          ErrorCode = "missing closing )"
  43         ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
  44         ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
  45 )
  46
  47 func (e ErrorCode) String() string {
  48         return string(e)
  49 }
  50
  51 // Flags control the behavior of the parser and record information about regexp context.
  52 type Flags uint16
  53
  54 const (
  55         FoldCase      Flags = 1 << iota // case-insensitive match
  56         Literal                         // treat pattern as literal string
  57         ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
  58         DotNL                           // allow . to match newline
  59         OneLine                         // treat ^ and $ as only matching at beginning and end of text
  60         NonGreedy                       // make repetition operators default to non-greedy
  61         PerlX                           // allow Perl extensions
  62         UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
  63         WasDollar                       // regexp OpEndText was $, not \z
  64         Simple                          // regexp contains no counted repetition
  65
  66         MatchNL = ClassNL | DotNL
  67
  68         Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
  69         POSIX Flags = 0                                         // POSIX syntax
  70 )
  71
  72 // Pseudo-ops for parsing stack.
  73 const (
  74         opLeftParen = opPseudo + iota
  75         opVerticalBar
  76 )
  77
  78 type parser struct {
  79         flags       Flags     // parse mode flags
  80         stack       []*Regexp // stack of parsed expressions
  81         free        *Regexp
  82         numCap      int // number of capturing groups seen
  83         wholeRegexp string
  84         tmpClass    []rune // temporary char class work space
  85 }
  86
  87 func (p *parser) newRegexp(op Op) *Regexp {
  88         re := p.free
  89         if re != nil {
  90                 p.free = re.Sub0[0]
  91                 *re = Regexp{}
  92         } else {
  93                 re = new(Regexp)
  94         }
  95         re.Op = op
  96         return re
  97 }
  98
  99 func (p *parser) reuse(re *Regexp) {
 100         re.Sub0[0] = p.free
 101         p.free = re
 102 }
 103
 104 // Parse stack manipulation.
 105
 106 // push pushes the regexp re onto the parse stack and returns the regexp.
 107 func (p *parser) push(re *Regexp) *Regexp {
 108         if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
 109                 // Single rune.
 110                 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
 111                         return nil
 112                 }
 113                 re.Op = OpLiteral
 114                 re.Rune = re.Rune[:1]
 115                 re.Flags = p.flags &^ FoldCase
 116         } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
 117                 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
 118                 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
 119                 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
 120                 re.Op == OpCharClass && len(re.Rune) == 2 &&
 121                         re.Rune[0]+1 == re.Rune[1] &&
 122                         unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
 123                         unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
 124                 // Case-insensitive rune like [Aa] or [Δδ].
 125                 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
 126                         return nil
 127                 }
 128
 129                 // Rewrite as (case-insensitive) literal.
 130                 re.Op = OpLiteral
 131                 re.Rune = re.Rune[:1]
 132                 re.Flags = p.flags | FoldCase
 133         } else {
 134                 // Incremental concatenation.
 135                 p.maybeConcat(-1, 0)
 136         }
 137
 138         p.stack = append(p.stack, re)
 139         return re
 140 }
 141
 142 // maybeConcat implements incremental concatenation
 143 // of literal runes into string nodes.  The parser calls this
 144 // before each push, so only the top fragment of the stack
 145 // might need processing.  Since this is called before a push,
 146 // the topmost literal is no longer subject to operators like *
 147 // (Otherwise ab* would turn into (ab)*.)
 148 // If r >= 0 and there's a node left over, maybeConcat uses it
 149 // to push r with the given flags.
 150 // maybeConcat reports whether r was pushed.
 151 func (p *parser) maybeConcat(r rune, flags Flags) bool {
 152         n := len(p.stack)
 153         if n < 2 {
 154                 return false
 155         }
 156
 157         re1 := p.stack[n-1]
 158         re2 := p.stack[n-2]
 159         if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
 160                 return false
 161         }
 162
 163         // Push re1 into re2.
 164         re2.Rune = append(re2.Rune, re1.Rune...)
 165
 166         // Reuse re1 if possible.
 167         if r >= 0 {
 168                 re1.Rune = re1.Rune0[:1]
 169                 re1.Rune[0] = r
 170                 re1.Flags = flags
 171                 return true
 172         }
 173
 174         p.stack = p.stack[:n-1]
 175         p.reuse(re1)
 176         return false // did not push r
 177 }
 178
 179 // newLiteral returns a new OpLiteral Regexp with the given flags
 180 func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
 181         re := p.newRegexp(OpLiteral)
 182         re.Flags = flags
 183         if flags&FoldCase != 0 {
 184                 r = minFoldRune(r)
 185         }
 186         re.Rune0[0] = r
 187         re.Rune = re.Rune0[:1]
 188         return re
 189 }
 190
 191 // minFoldRune returns the minimum rune fold-equivalent to r.
 192 func minFoldRune(r rune) rune {
 193         if r < MinFold || r > MaxFold {
 194                 return r
 195         }
 196         min := r
 197         r0 := r
 198         for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
 199                 if min > r {
 200                         min = r
 201                 }
 202         }
 203         return min
 204 }
 205
 206 // literal pushes a literal regexp for the rune r on the stack
 207 // and returns that regexp.
 208 func (p *parser) literal(r rune) {
 209         p.push(p.newLiteral(r, p.flags))
 210 }
 211
 212 // op pushes a regexp with the given op onto the stack
 213 // and returns that regexp.
 214 func (p *parser) op(op Op) *Regexp {
 215         re := p.newRegexp(op)
 216         re.Flags = p.flags
 217         return p.push(re)
 218 }
 219
 220 // repeat replaces the top stack element with itself repeated according to op, min, max.
 221 // before is the regexp suffix starting at the repetition operator.
 222 // after is the regexp suffix following after the repetition operator.
 223 // repeat returns an updated 'after' and an error, if any.
 224 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
 225         flags := p.flags
 226         if p.flags&PerlX != 0 {
 227                 if len(after) > 0 && after[0] == '?' {
 228                         after = after[1:]
 229                         flags ^= NonGreedy
 230                 }
 231                 if lastRepeat != "" {
 232                         // In Perl it is not allowed to stack repetition operators:
 233                         // a** is a syntax error, not a doubled star, and a++ means
 234                         // something else entirely, which we don't support!
 235                         return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
 236                 }
 237         }
 238         n := len(p.stack)
 239         if n == 0 {
 240                 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
 241         }
 242         sub := p.stack[n-1]
 243         if sub.Op >= opPseudo {
 244                 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
 245         }
 246         re := p.newRegexp(op)
 247         re.Min = min
 248         re.Max = max
 249         re.Flags = flags
 250         re.Sub = re.Sub0[:1]
 251         re.Sub[0] = sub
 252         p.stack[n-1] = re
 253         return after, nil
 254 }
 255
 256 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
 257 func (p *parser) concat() *Regexp {
 258         p.maybeConcat(-1, 0)
 259
 260         // Scan down to find pseudo-operator | or (.
 261         i := len(p.stack)
 262         for i > 0 && p.stack[i-1].Op < opPseudo {
 263                 i--
 264         }
 265         subs := p.stack[i:]
 266         p.stack = p.stack[:i]
 267
 268         // Empty concatenation is special case.
 269         if len(subs) == 0 {
 270                 return p.push(p.newRegexp(OpEmptyMatch))
 271         }
 272
 273         return p.push(p.collapse(subs, OpConcat))
 274 }
 275
 276 // alternate replaces the top of the stack (above the topmost '(') with its alternation.
 277 func (p *parser) alternate() *Regexp {
 278         // Scan down to find pseudo-operator (.
 279         // There are no | above (.
 280         i := len(p.stack)
 281         for i > 0 && p.stack[i-1].Op < opPseudo {
 282                 i--
 283         }
 284         subs := p.stack[i:]
 285         p.stack = p.stack[:i]
 286
 287         // Make sure top class is clean.
 288         // All the others already are (see swapVerticalBar).
 289         if len(subs) > 0 {
 290                 cleanAlt(subs[len(subs)-1])
 291         }
 292
 293         // Empty alternate is special case
 294         // (shouldn't happen but easy to handle).
 295         if len(subs) == 0 {
 296                 return p.push(p.newRegexp(OpNoMatch))
 297         }
 298
 299         return p.push(p.collapse(subs, OpAlternate))
 300 }
 301
 302 // cleanAlt cleans re for eventual inclusion in an alternation.
 303 func cleanAlt(re *Regexp) {
 304         switch re.Op {
 305         case OpCharClass:
 306                 re.Rune = cleanClass(&re.Rune)
 307                 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
 308                         re.Rune = nil
 309                         re.Op = OpAnyChar
 310                         return
 311                 }
 312                 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
 313                         re.Rune = nil
 314                         re.Op = OpAnyCharNotNL
 315                         return
 316                 }
 317                 if cap(re.Rune)-len(re.Rune) > 100 {
 318                         // re.Rune will not grow any more.
 319                         // Make a copy or inline to reclaim storage.
 320                         re.Rune = append(re.Rune0[:0], re.Rune...)
 321                 }
 322         }
 323 }
 324
 325 // collapse returns the result of applying op to sub.
 326 // If sub contains op nodes, they all get hoisted up
 327 // so that there is never a concat of a concat or an
 328 // alternate of an alternate.
 329 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
 330         if len(subs) == 1 {
 331                 return subs[0]
 332         }
 333         re := p.newRegexp(op)
 334         re.Sub = re.Sub0[:0]
 335         for _, sub := range subs {
 336                 if sub.Op == op {
 337                         re.Sub = append(re.Sub, sub.Sub...)
 338                         p.reuse(sub)
 339                 } else {
 340                         re.Sub = append(re.Sub, sub)
 341                 }
 342         }
 343         if op == OpAlternate {
 344                 re.Sub = p.factor(re.Sub, re.Flags)
 345                 if len(re.Sub) == 1 {
 346                         old := re
 347                         re = re.Sub[0]
 348                         p.reuse(old)
 349                 }
 350         }
 351         return re
 352 }
 353
 354 // factor factors common prefixes from the alternation list sub.
 355 // It returns a replacement list that reuses the same storage and
 356 // frees (passes to p.reuse) any removed *Regexps.
 357 //
 358 // For example,
 359 //     ABC|ABD|AEF|BCX|BCY
 360 // simplifies by literal prefix extraction to
 361 //     A(B(C|D)|EF)|BC(X|Y)
 362 // which simplifies by character class introduction to
 363 //     A(B[CD]|EF)|BC[XY]
 364 //
 365 func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
 366         if len(sub) < 2 {
 367                 return sub
 368         }
 369
 370         // Round 1: Factor out common literal prefixes.
 371         var str []rune
 372         var strflags Flags
 373         start := 0
 374         out := sub[:0]
 375         for i := 0; i <= len(sub); i++ {
 376                 // Invariant: the Regexps that were in sub[0:start] have been
 377                 // used or marked for reuse, and the slice space has been reused
 378                 // for out (len(out) <= start).
 379                 //
 380                 // Invariant: sub[start:i] consists of regexps that all begin
 381                 // with str as modified by strflags.
 382                 var istr []rune
 383                 var iflags Flags
 384                 if i < len(sub) {
 385                         istr, iflags = p.leadingString(sub[i])
 386                         if iflags == strflags {
 387                                 same := 0
 388                                 for same < len(str) && same < len(istr) && str[same] == istr[same] {
 389                                         same++
 390                                 }
 391                                 if same > 0 {
 392                                         // Matches at least one rune in current range.
 393                                         // Keep going around.
 394                                         str = str[:same]
 395                                         continue
 396                                 }
 397                         }
 398                 }
 399
 400                 // Found end of a run with common leading literal string:
 401                 // sub[start:i] all begin with str[0:len(str)], but sub[i]
 402                 // does not even begin with str[0].
 403                 //
 404                 // Factor out common string and append factored expression to out.
 405                 if i == start {
 406                         // Nothing to do - run of length 0.
 407                 } else if i == start+1 {
 408                         // Just one: don't bother factoring.
 409                         out = append(out, sub[start])
 410                 } else {
 411                         // Construct factored form: prefix(suffix1|suffix2|...)
 412                         prefix := p.newRegexp(OpLiteral)
 413                         prefix.Flags = strflags
 414                         prefix.Rune = append(prefix.Rune[:0], str...)
 415
 416                         for j := start; j < i; j++ {
 417                                 sub[j] = p.removeLeadingString(sub[j], len(str))
 418                         }
 419                         suffix := p.collapse(sub[start:i], OpAlternate) // recurse
 420
 421                         re := p.newRegexp(OpConcat)
 422                         re.Sub = append(re.Sub[:0], prefix, suffix)
 423                         out = append(out, re)
 424                 }
 425
 426                 // Prepare for next iteration.
 427                 start = i
 428                 str = istr
 429                 strflags = iflags
 430         }
 431         sub = out
 432
 433         // Round 2: Factor out common complex prefixes,
 434         // just the first piece of each concatenation,
 435         // whatever it is.  This is good enough a lot of the time.
 436         start = 0
 437         out = sub[:0]
 438         var first *Regexp
 439         for i := 0; i <= len(sub); i++ {
 440                 // Invariant: the Regexps that were in sub[0:start] have been
 441                 // used or marked for reuse, and the slice space has been reused
 442                 // for out (len(out) <= start).
 443                 //
 444                 // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
 445                 var ifirst *Regexp
 446                 if i < len(sub) {
 447                         ifirst = p.leadingRegexp(sub[i])
 448                         if first != nil && first.Equal(ifirst) {
 449                                 continue
 450                         }
 451                 }
 452
 453                 // Found end of a run with common leading regexp:
 454                 // sub[start:i] all begin with first but sub[i] does not.
 455                 //
 456                 // Factor out common regexp and append factored expression to out.
 457                 if i == start {
 458                         // Nothing to do - run of length 0.
 459                 } else if i == start+1 {
 460                         // Just one: don't bother factoring.
 461                         out = append(out, sub[start])
 462                 } else {
 463                         // Construct factored form: prefix(suffix1|suffix2|...)
 464                         prefix := first
 465                         for j := start; j < i; j++ {
 466                                 reuse := j != start // prefix came from sub[start]
 467                                 sub[j] = p.removeLeadingRegexp(sub[j], reuse)
 468                         }
 469                         suffix := p.collapse(sub[start:i], OpAlternate) // recurse
 470
 471                         re := p.newRegexp(OpConcat)
 472                         re.Sub = append(re.Sub[:0], prefix, suffix)
 473                         out = append(out, re)
 474                 }
 475
 476                 // Prepare for next iteration.
 477                 start = i
 478                 first = ifirst
 479         }
 480         sub = out
 481
 482         // Round 3: Collapse runs of single literals into character classes.
 483         start = 0
 484         out = sub[:0]
 485         for i := 0; i <= len(sub); i++ {
 486                 // Invariant: the Regexps that were in sub[0:start] have been
 487                 // used or marked for reuse, and the slice space has been reused
 488                 // for out (len(out) <= start).
 489                 //
 490                 // Invariant: sub[start:i] consists of regexps that are either
 491                 // literal runes or character classes.
 492                 if i < len(sub) && isCharClass(sub[i]) {
 493                         continue
 494                 }
 495
 496                 // sub[i] is not a char or char class;
 497                 // emit char class for sub[start:i]...
 498                 if i == start {
 499                         // Nothing to do - run of length 0.
 500                 } else if i == start+1 {
 501                         out = append(out, sub[start])
 502                 } else {
 503                         // Make new char class.
 504                         // Start with most complex regexp in sub[start].
 505                         max := start
 506                         for j := start + 1; j < i; j++ {
 507                                 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
 508                                         max = j
 509                                 }
 510                         }
 511                         sub[start], sub[max] = sub[max], sub[start]
 512
 513                         for j := start + 1; j < i; j++ {
 514                                 mergeCharClass(sub[start], sub[j])
 515                                 p.reuse(sub[j])
 516                         }
 517                         cleanAlt(sub[start])
 518                         out = append(out, sub[start])
 519                 }
 520
 521                 // ... and then emit sub[i].
 522                 if i < len(sub) {
 523                         out = append(out, sub[i])
 524                 }
 525                 start = i + 1
 526         }
 527         sub = out
 528
 529         // Round 4: Collapse runs of empty matches into a single empty match.
 530         start = 0
 531         out = sub[:0]
 532         for i := range sub {
 533                 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
 534                         continue
 535                 }
 536                 out = append(out, sub[i])
 537         }
 538         sub = out
 539
 540         return sub
 541 }
 542
 543 // leadingString returns the leading literal string that re begins with.
 544 // The string refers to storage in re or its children.
 545 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
 546         if re.Op == OpConcat && len(re.Sub) > 0 {
 547                 re = re.Sub[0]
 548         }
 549         if re.Op != OpLiteral {
 550                 return nil, 0
 551         }
 552         return re.Rune, re.Flags & FoldCase
 553 }
 554
 555 // removeLeadingString removes the first n leading runes
 556 // from the beginning of re.  It returns the replacement for re.
 557 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
 558         if re.Op == OpConcat && len(re.Sub) > 0 {
 559                 // Removing a leading string in a concatenation
 560                 // might simplify the concatenation.
 561                 sub := re.Sub[0]
 562                 sub = p.removeLeadingString(sub, n)
 563                 re.Sub[0] = sub
 564                 if sub.Op == OpEmptyMatch {
 565                         p.reuse(sub)
 566                         switch len(re.Sub) {
 567                         case 0, 1:
 568                                 // Impossible but handle.
 569                                 re.Op = OpEmptyMatch
 570                                 re.Sub = nil
 571                         case 2:
 572                                 old := re
 573                                 re = re.Sub[1]
 574                                 p.reuse(old)
 575                         default:
 576                                 copy(re.Sub, re.Sub[1:])
 577                                 re.Sub = re.Sub[:len(re.Sub)-1]
 578                         }
 579                 }
 580                 return re
 581         }
 582
 583         if re.Op == OpLiteral {
 584                 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
 585                 if len(re.Rune) == 0 {
 586                         re.Op = OpEmptyMatch
 587                 }
 588         }
 589         return re
 590 }
 591
 592 // leadingRegexp returns the leading regexp that re begins with.
 593 // The regexp refers to storage in re or its children.
 594 func (p *parser) leadingRegexp(re *Regexp) *Regexp {
 595         if re.Op == OpEmptyMatch {
 596                 return nil
 597         }
 598         if re.Op == OpConcat && len(re.Sub) > 0 {
 599                 sub := re.Sub[0]
 600                 if sub.Op == OpEmptyMatch {
 601                         return nil
 602                 }
 603                 return sub
 604         }
 605         return re
 606 }
 607
 608 // removeLeadingRegexp removes the leading regexp in re.
 609 // It returns the replacement for re.
 610 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
 611 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
 612         if re.Op == OpConcat && len(re.Sub) > 0 {
 613                 if reuse {
 614                         p.reuse(re.Sub[0])
 615                 }
 616                 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
 617                 switch len(re.Sub) {
 618                 case 0:
 619                         re.Op = OpEmptyMatch
 620                         re.Sub = nil
 621                 case 1:
 622                         old := re
 623                         re = re.Sub[0]
 624                         p.reuse(old)
 625                 }
 626                 return re
 627         }
 628         if reuse {
 629                 p.reuse(re)
 630         }
 631         return p.newRegexp(OpEmptyMatch)
 632 }
 633
 634 func literalRegexp(s string, flags Flags) *Regexp {
 635         re := &Regexp{Op: OpLiteral}
 636         re.Flags = flags
 637         re.Rune = re.Rune0[:0] // use local storage for small strings
 638         for _, c := range s {
 639                 if len(re.Rune) >= cap(re.Rune) {
 640                         // string is too long to fit in Rune0.  let Go handle it
 641                         re.Rune = []rune(s)
 642                         break
 643                 }
 644                 re.Rune = append(re.Rune, c)
 645         }
 646         return re
 647 }
 648
 649 // Parsing.
 650
 651 func Parse(s string, flags Flags) (*Regexp, error) {
 652         if flags&Literal != 0 {
 653                 // Trivial parser for literal string.
 654                 if err := checkUTF8(s); err != nil {
 655                         return nil, err
 656                 }
 657                 return literalRegexp(s, flags), nil
 658         }
 659
 660         // Otherwise, must do real work.
 661         var (
 662                 p          parser
 663                 err        error
 664                 c          rune
 665                 op         Op
 666                 lastRepeat string
 667                 min, max   int
 668         )
 669         p.flags = flags
 670         p.wholeRegexp = s
 671         t := s
 672         for t != "" {
 673                 repeat := ""
 674         BigSwitch:
 675                 switch t[0] {
 676                 default:
 677                         if c, t, err = nextRune(t); err != nil {
 678                                 return nil, err
 679                         }
 680                         p.literal(c)
 681
 682                 case '(':
 683                         if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
 684                                 // Flag changes and non-capturing groups.
 685                                 if t, err = p.parsePerlFlags(t); err != nil {
 686                                         return nil, err
 687                                 }
 688                                 break
 689                         }
 690                         p.numCap++
 691                         p.op(opLeftParen).Cap = p.numCap
 692                         t = t[1:]
 693                 case '|':
 694                         if err = p.parseVerticalBar(); err != nil {
 695                                 return nil, err
 696                         }
 697                         t = t[1:]
 698                 case ')':
 699                         if err = p.parseRightParen(); err != nil {
 700                                 return nil, err
 701                         }
 702                         t = t[1:]
 703                 case '^':
 704                         if p.flags&OneLine != 0 {
 705                                 p.op(OpBeginText)
 706                         } else {
 707                                 p.op(OpBeginLine)
 708                         }
 709                         t = t[1:]
 710                 case '$':
 711                         if p.flags&OneLine != 0 {
 712                                 p.op(OpEndText).Flags |= WasDollar
 713                         } else {
 714                                 p.op(OpEndLine)
 715                         }
 716                         t = t[1:]
 717                 case '.':
 718                         if p.flags&DotNL != 0 {
 719                                 p.op(OpAnyChar)
 720                         } else {
 721                                 p.op(OpAnyCharNotNL)
 722                         }
 723                         t = t[1:]
 724                 case '[':
 725                         if t, err = p.parseClass(t); err != nil {
 726                                 return nil, err
 727                         }
 728                 case '*', '+', '?':
 729                         before := t
 730                         switch t[0] {
 731                         case '*':
 732                                 op = OpStar
 733                         case '+':
 734                                 op = OpPlus
 735                         case '?':
 736                                 op = OpQuest
 737                         }
 738                         after := t[1:]
 739                         if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
 740                                 return nil, err
 741                         }
 742                         repeat = before
 743                         t = after
 744                 case '{':
 745                         op = OpRepeat
 746                         before := t
 747                         min, max, after, ok := p.parseRepeat(t)
 748                         if !ok {
 749                                 // If the repeat cannot be parsed, { is a literal.
 750                                 p.literal('{')
 751                                 t = t[1:]
 752                                 break
 753                         }
 754                         if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
 755                                 // Numbers were too big, or max is present and min > max.
 756                                 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
 757                         }
 758                         if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
 759                                 return nil, err
 760                         }
 761                         repeat = before
 762                         t = after
 763                 case '\\':
 764                         if p.flags&PerlX != 0 && len(t) >= 2 {
 765                                 switch t[1] {
 766                                 case 'A':
 767                                         p.op(OpBeginText)
 768                                         t = t[2:]
 769                                         break BigSwitch
 770                                 case 'b':
 771                                         p.op(OpWordBoundary)
 772                                         t = t[2:]
 773                                         break BigSwitch
 774                                 case 'B':
 775                                         p.op(OpNoWordBoundary)
 776                                         t = t[2:]
 777                                         break BigSwitch
 778                                 case 'C':
 779                                         // any byte; not supported
 780                                         return nil, &Error{ErrInvalidEscape, t[:2]}
 781                                 case 'Q':
 782                                         // \Q ... \E: the ... is always literals
 783                                         var lit string
 784                                         if i := strings.Index(t, `\E`); i < 0 {
 785                                                 lit = t[2:]
 786                                                 t = ""
 787                                         } else {
 788                                                 lit = t[2:i]
 789                                                 t = t[i+2:]
 790                                         }
 791                                         p.push(literalRegexp(lit, p.flags))
 792                                         break BigSwitch
 793                                 case 'z':
 794                                         p.op(OpEndText)
 795                                         t = t[2:]
 796                                         break BigSwitch
 797                                 }
 798                         }
 799
 800                         re := p.newRegexp(OpCharClass)
 801                         re.Flags = p.flags
 802
 803                         // Look for Unicode character group like \p{Han}
 804                         if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
 805                                 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
 806                                 if err != nil {
 807                                         return nil, err
 808                                 }
 809                                 if r != nil {
 810                                         re.Rune = r
 811                                         t = rest
 812                                         p.push(re)
 813                                         break BigSwitch
 814                                 }
 815                         }
 816
 817                         // Perl character class escape.
 818                         if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
 819                                 re.Rune = r
 820                                 t = rest
 821                                 p.push(re)
 822                                 break BigSwitch
 823                         }
 824                         p.reuse(re)
 825
 826                         // Ordinary single-character escape.
 827                         if c, t, err = p.parseEscape(t); err != nil {
 828                                 return nil, err
 829                         }
 830                         p.literal(c)
 831                 }
 832                 lastRepeat = repeat
 833         }
 834
 835         p.concat()
 836         if p.swapVerticalBar() {
 837                 // pop vertical bar
 838                 p.stack = p.stack[:len(p.stack)-1]
 839         }
 840         p.alternate()
 841
 842         n := len(p.stack)
 843         if n != 1 {
 844                 return nil, &Error{ErrMissingParen, s}
 845         }
 846         return p.stack[0], nil
 847 }
 848
 849 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
 850 // If s is not of that form, it returns ok == false.
 851 // If s has the right form but the values are too big, it returns min == -1, ok == true.
 852 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
 853         if s == "" || s[0] != '{' {
 854                 return
 855         }
 856         s = s[1:]
 857         var ok1 bool
 858         if min, s, ok1 = p.parseInt(s); !ok1 {
 859                 return
 860         }
 861         if s == "" {
 862                 return
 863         }
 864         if s[0] != ',' {
 865                 max = min
 866         } else {
 867                 s = s[1:]
 868                 if s == "" {
 869                         return
 870                 }
 871                 if s[0] == '}' {
 872                         max = -1
 873                 } else if max, s, ok1 = p.parseInt(s); !ok1 {
 874                         return
 875                 } else if max < 0 {
 876                         // parseInt found too big a number
 877                         min = -1
 878                 }
 879         }
 880         if s == "" || s[0] != '}' {
 881                 return
 882         }
 883         rest = s[1:]
 884         ok = true
 885         return
 886 }
 887
 888 // parsePerlFlags parses a Perl flag setting or non-capturing group or both,
 889 // like (?i) or (?: or (?i:.  It removes the prefix from s and updates the parse state.
 890 // The caller must have ensured that s begins with "(?".
 891 func (p *parser) parsePerlFlags(s string) (rest string, err error) {
 892         t := s
 893
 894         // Check for named captures, first introduced in Python's regexp library.
 895         // As usual, there are three slightly different syntaxes:
 896         //
 897         //   (?P<name>expr)   the original, introduced by Python
 898         //   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
 899         //   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
 900         //
 901         // Perl 5.10 gave in and implemented the Python version too,
 902         // but they claim that the last two are the preferred forms.
 903         // PCRE and languages based on it (specifically, PHP and Ruby)
 904         // support all three as well.  EcmaScript 4 uses only the Python form.
 905         //
 906         // In both the open source world (via Code Search) and the
 907         // Google source tree, (?P<expr>name) is the dominant form,
 908         // so that's the one we implement.  One is enough.
 909         if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
 910                 // Pull out name.
 911                 end := strings.IndexRune(t, '>')
 912                 if end < 0 {
 913                         if err = checkUTF8(t); err != nil {
 914                                 return "", err
 915                         }
 916                         return "", &Error{ErrInvalidNamedCapture, s}
 917                 }
 918
 919                 capture := t[:end+1] // "(?P<name>"
 920                 name := t[4:end]     // "name"
 921                 if err = checkUTF8(name); err != nil {
 922                         return "", err
 923                 }
 924                 if !isValidCaptureName(name) {
 925                         return "", &Error{ErrInvalidNamedCapture, capture}
 926                 }
 927
 928                 // Like ordinary capture, but named.
 929                 p.numCap++
 930                 re := p.op(opLeftParen)
 931                 re.Cap = p.numCap
 932                 re.Name = name
 933                 return t[end+1:], nil
 934         }
 935
 936         // Non-capturing group.  Might also twiddle Perl flags.
 937         var c rune
 938         t = t[2:] // skip (?
 939         flags := p.flags
 940         sign := +1
 941         sawFlag := false
 942 Loop:
 943         for t != "" {
 944                 if c, t, err = nextRune(t); err != nil {
 945                         return "", err
 946                 }
 947                 switch c {
 948                 default:
 949                         break Loop
 950
 951                 // Flags.
 952                 case 'i':
 953                         flags |= FoldCase
 954                         sawFlag = true
 955                 case 'm':
 956                         flags &^= OneLine
 957                         sawFlag = true
 958                 case 's':
 959                         flags |= DotNL
 960                         sawFlag = true
 961                 case 'U':
 962                         flags |= NonGreedy
 963                         sawFlag = true
 964
 965                 // Switch to negation.
 966                 case '-':
 967                         if sign < 0 {
 968                                 break Loop
 969                         }
 970                         sign = -1
 971                         // Invert flags so that | above turn into &^ and vice versa.
 972                         // We'll invert flags again before using it below.
 973                         flags = ^flags
 974                         sawFlag = false
 975
 976                 // End of flags, starting group or not.
 977                 case ':', ')':
 978                         if sign < 0 {
 979                                 if !sawFlag {
 980                                         break Loop
 981                                 }
 982                                 flags = ^flags
 983                         }
 984                         if c == ':' {
 985                                 // Open new group
 986                                 p.op(opLeftParen)
 987                         }
 988                         p.flags = flags
 989                         return t, nil
 990                 }
 991         }
 992
 993         return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
 994 }
 995
 996 // isValidCaptureName reports whether name
 997 // is a valid capture name: [A-Za-z0-9_]+.
 998 // PCRE limits names to 32 bytes.
 999 // Python rejects names starting with digits.
1000 // We don't enforce either of those.
1001 func isValidCaptureName(name string) bool {
1002         if name == "" {
1003                 return false
1004         }
1005         for _, c := range name {
1006                 if c != '_' && !isalnum(c) {
1007                         return false
1008                 }
1009         }
1010         return true
1011 }
1012
1013 // parseInt parses a decimal integer.
1014 func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1015         if s == "" || s[0] < '0' || '9' < s[0] {
1016                 return
1017         }
1018         // Disallow leading zeros.
1019         if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1020                 return
1021         }
1022         t := s
1023         for s != "" && '0' <= s[0] && s[0] <= '9' {
1024                 s = s[1:]
1025         }
1026         rest = s
1027         ok = true
1028         // Have digits, compute value.
1029         t = t[:len(t)-len(s)]
1030         for i := 0; i < len(t); i++ {
1031                 // Avoid overflow.
1032                 if n >= 1e8 {
1033                         n = -1
1034                         break
1035                 }
1036                 n = n*10 + int(t[i]) - '0'
1037         }
1038         return
1039 }
1040
1041 // can this be represented as a character class?
1042 // single-rune literal string, char class, ., and .|\n.
1043 func isCharClass(re *Regexp) bool {
1044         return re.Op == OpLiteral && len(re.Rune) == 1 ||
1045                 re.Op == OpCharClass ||
1046                 re.Op == OpAnyCharNotNL ||
1047                 re.Op == OpAnyChar
1048 }
1049
1050 // does re match r?
1051 func matchRune(re *Regexp, r rune) bool {
1052         switch re.Op {
1053         case OpLiteral:
1054                 return len(re.Rune) == 1 && re.Rune[0] == r
1055         case OpCharClass:
1056                 for i := 0; i < len(re.Rune); i += 2 {
1057                         if re.Rune[i] <= r && r <= re.Rune[i+1] {
1058                                 return true
1059                         }
1060                 }
1061                 return false
1062         case OpAnyCharNotNL:
1063                 return r != '\n'
1064         case OpAnyChar:
1065                 return true
1066         }
1067         return false
1068 }
1069
1070 // parseVerticalBar handles a | in the input.
1071 func (p *parser) parseVerticalBar() error {
1072         p.concat()
1073
1074         // The concatenation we just parsed is on top of the stack.
1075         // If it sits above an opVerticalBar, swap it below
1076         // (things below an opVerticalBar become an alternation).
1077         // Otherwise, push a new vertical bar.
1078         if !p.swapVerticalBar() {
1079                 p.op(opVerticalBar)
1080         }
1081
1082         return nil
1083 }
1084
1085 // mergeCharClass makes dst = dst|src.
1086 // The caller must ensure that dst.Op >= src.Op,
1087 // to reduce the amount of copying.
1088 func mergeCharClass(dst, src *Regexp) {
1089         switch dst.Op {
1090         case OpAnyChar:
1091                 // src doesn't add anything.
1092         case OpAnyCharNotNL:
1093                 // src might add \n
1094                 if matchRune(src, '\n') {
1095                         dst.Op = OpAnyChar
1096                 }
1097         case OpCharClass:
1098                 // src is simpler, so either literal or char class
1099                 if src.Op == OpLiteral {
1100                         dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1101                 } else {
1102                         dst.Rune = appendClass(dst.Rune, src.Rune)
1103                 }
1104         case OpLiteral:
1105                 // both literal
1106                 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1107                         break
1108                 }
1109                 dst.Op = OpCharClass
1110                 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1111                 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1112         }
1113 }
1114
1115 // If the top of the stack is an element followed by an opVerticalBar
1116 // swapVerticalBar swaps the two and returns true.
1117 // Otherwise it returns false.
1118 func (p *parser) swapVerticalBar() bool {
1119         // If above and below vertical bar are literal or char class,
1120         // can merge into a single char class.
1121         n := len(p.stack)
1122         if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1123                 re1 := p.stack[n-1]
1124                 re3 := p.stack[n-3]
1125                 // Make re3 the more complex of the two.
1126                 if re1.Op > re3.Op {
1127                         re1, re3 = re3, re1
1128                         p.stack[n-3] = re3
1129                 }
1130                 mergeCharClass(re3, re1)
1131                 p.reuse(re1)
1132                 p.stack = p.stack[:n-1]
1133                 return true
1134         }
1135
1136         if n >= 2 {
1137                 re1 := p.stack[n-1]
1138                 re2 := p.stack[n-2]
1139                 if re2.Op == opVerticalBar {
1140                         if n >= 3 {
1141                                 // Now out of reach.
1142                                 // Clean opportunistically.
1143                                 cleanAlt(p.stack[n-3])
1144                         }
1145                         p.stack[n-2] = re1
1146                         p.stack[n-1] = re2
1147                         return true
1148                 }
1149         }
1150         return false
1151 }
1152
1153 // parseRightParen handles a ) in the input.
1154 func (p *parser) parseRightParen() error {
1155         p.concat()
1156         if p.swapVerticalBar() {
1157                 // pop vertical bar
1158                 p.stack = p.stack[:len(p.stack)-1]
1159         }
1160         p.alternate()
1161
1162         n := len(p.stack)
1163         if n < 2 {
1164                 return &Error{ErrInternalError, ""}
1165         }
1166         re1 := p.stack[n-1]
1167         re2 := p.stack[n-2]
1168         p.stack = p.stack[:n-2]
1169         if re2.Op != opLeftParen {
1170                 return &Error{ErrMissingParen, p.wholeRegexp}
1171         }
1172         // Restore flags at time of paren.
1173         p.flags = re2.Flags
1174         if re2.Cap == 0 {
1175                 // Just for grouping.
1176                 p.push(re1)
1177         } else {
1178                 re2.Op = OpCapture
1179                 re2.Sub = re2.Sub0[:1]
1180                 re2.Sub[0] = re1
1181                 p.push(re2)
1182         }
1183         return nil
1184 }
1185
1186 // parseEscape parses an escape sequence at the beginning of s
1187 // and returns the rune.
1188 func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
1189         t := s[1:]
1190         if t == "" {
1191                 return 0, "", &Error{ErrTrailingBackslash, ""}
1192         }
1193         c, t, err := nextRune(t)
1194         if err != nil {
1195                 return 0, "", err
1196         }
1197
1198 Switch:
1199         switch c {
1200         default:
1201                 if c < utf8.RuneSelf && !isalnum(c) {
1202                         // Escaped non-word characters are always themselves.
1203                         // PCRE is not quite so rigorous: it accepts things like
1204                         // \q, but we don't.  We once rejected \_, but too many
1205                         // programs and people insist on using it, so allow \_.
1206                         return c, t, nil
1207                 }
1208
1209         // Octal escapes.
1210         case '1', '2', '3', '4', '5', '6', '7':
1211                 // Single non-zero digit is a backreference; not supported
1212                 if t == "" || t[0] < '0' || t[0] > '7' {
1213                         break
1214                 }
1215                 fallthrough
1216         case '0':
1217                 // Consume up to three octal digits; already have one.
1218                 r = c - '0'
1219                 for i := 1; i < 3; i++ {
1220                         if t == "" || t[0] < '0' || t[0] > '7' {
1221                                 break
1222                         }
1223                         r = r*8 + rune(t[0]) - '0'
1224                         t = t[1:]
1225                 }
1226                 return r, t, nil
1227
1228         // Hexadecimal escapes.
1229         case 'x':
1230                 if t == "" {
1231                         break
1232                 }
1233                 if c, t, err = nextRune(t); err != nil {
1234                         return 0, "", err
1235                 }
1236                 if c == '{' {
1237                         // Any number of digits in braces.
1238                         // Perl accepts any text at all; it ignores all text
1239                         // after the first non-hex digit.  We require only hex digits,
1240                         // and at least one.
1241                         nhex := 0
1242                         r = 0
1243                         for {
1244                                 if t == "" {
1245                                         break Switch
1246                                 }
1247                                 if c, t, err = nextRune(t); err != nil {
1248                                         return 0, "", err
1249                                 }
1250                                 if c == '}' {
1251                                         break
1252                                 }
1253                                 v := unhex(c)
1254                                 if v < 0 {
1255                                         break Switch
1256                                 }
1257                                 r = r*16 + v
1258                                 if r > unicode.MaxRune {
1259                                         break Switch
1260                                 }
1261                                 nhex++
1262                         }
1263                         if nhex == 0 {
1264                                 break Switch
1265                         }
1266                         return r, t, nil
1267                 }
1268
1269                 // Easy case: two hex digits.
1270                 x := unhex(c)
1271                 if c, t, err = nextRune(t); err != nil {
1272                         return 0, "", err
1273                 }
1274                 y := unhex(c)
1275                 if x < 0 || y < 0 {
1276                         break
1277                 }
1278                 return x*16 + y, t, nil
1279
1280         // C escapes.  There is no case 'b', to avoid misparsing
1281         // the Perl word-boundary \b as the C backspace \b
1282         // when in POSIX mode.  In Perl, /\b/ means word-boundary
1283         // but /[\b]/ means backspace.  We don't support that.
1284         // If you want a backspace, embed a literal backspace
1285         // character or use \x08.
1286         case 'a':
1287                 return '\a', t, err
1288         case 'f':
1289                 return '\f', t, err
1290         case 'n':
1291                 return '\n', t, err
1292         case 'r':
1293                 return '\r', t, err
1294         case 't':
1295                 return '\t', t, err
1296         case 'v':
1297                 return '\v', t, err
1298         }
1299         return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1300 }
1301
1302 // parseClassChar parses a character class character at the beginning of s
1303 // and returns it.
1304 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
1305         if s == "" {
1306                 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1307         }
1308
1309         // Allow regular escape sequences even though
1310         // many need not be escaped in this context.
1311         if s[0] == '\\' {
1312                 return p.parseEscape(s)
1313         }
1314
1315         return nextRune(s)
1316 }
1317
1318 type charGroup struct {
1319         sign  int
1320         class []rune
1321 }
1322
1323 // parsePerlClassEscape parses a leading Perl character class escape like \d
1324 // from the beginning of s.  If one is present, it appends the characters to r
1325 // and returns the new slice r and the remainder of the string.
1326 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
1327         if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1328                 return
1329         }
1330         g := perlGroup[s[0:2]]
1331         if g.sign == 0 {
1332                 return
1333         }
1334         return p.appendGroup(r, g), s[2:]
1335 }
1336
1337 // parseNamedClass parses a leading POSIX named character class like [:alnum:]
1338 // from the beginning of s.  If one is present, it appends the characters to r
1339 // and returns the new slice r and the remainder of the string.
1340 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
1341         if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1342                 return
1343         }
1344
1345         i := strings.Index(s[2:], ":]")
1346         if i < 0 {
1347                 return
1348         }
1349         i += 2
1350         name, s := s[0:i+2], s[i+2:]
1351         g := posixGroup[name]
1352         if g.sign == 0 {
1353                 return nil, "", &Error{ErrInvalidCharRange, name}
1354         }
1355         return p.appendGroup(r, g), s, nil
1356 }
1357
1358 func (p *parser) appendGroup(r []rune, g charGroup) []rune {
1359         if p.flags&FoldCase == 0 {
1360                 if g.sign < 0 {
1361                         r = appendNegatedClass(r, g.class)
1362                 } else {
1363                         r = appendClass(r, g.class)
1364                 }
1365         } else {
1366                 tmp := p.tmpClass[:0]
1367                 tmp = appendFoldedClass(tmp, g.class)
1368                 p.tmpClass = tmp
1369                 tmp = cleanClass(&p.tmpClass)
1370                 if g.sign < 0 {
1371                         r = appendNegatedClass(r, tmp)
1372                 } else {
1373                         r = appendClass(r, tmp)
1374                 }
1375         }
1376         return r
1377 }
1378
1379 var anyTable = &unicode.RangeTable{
1380         []unicode.Range16{{0, 1<<16 - 1, 1}},
1381         []unicode.Range32{{1 << 16, unicode.MaxRune, 1}},
1382 }
1383
1384 // unicodeTable returns the unicode.RangeTable identified by name
1385 // and the table of additional fold-equivalent code points.
1386 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1387         // Special case: "Any" means any.
1388         if name == "Any" {
1389                 return anyTable, anyTable
1390         }
1391         if t := unicode.Categories[name]; t != nil {
1392                 return t, unicode.FoldCategory[name]
1393         }
1394         if t := unicode.Scripts[name]; t != nil {
1395                 return t, unicode.FoldScript[name]
1396         }
1397         return nil, nil
1398 }
1399
1400 // parseUnicodeClass parses a leading Unicode character class like \p{Han}
1401 // from the beginning of s.  If one is present, it appends the characters to r
1402 // and returns the new slice r and the remainder of the string.
1403 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1404         if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1405                 return
1406         }
1407
1408         // Committed to parse or return error.
1409         sign := +1
1410         if s[1] == 'P' {
1411                 sign = -1
1412         }
1413         t := s[2:]
1414         c, t, err := nextRune(t)
1415         if err != nil {
1416                 return
1417         }
1418         var seq, name string
1419         if c != '{' {
1420                 // Single-letter name.
1421                 seq = s[:len(s)-len(t)]
1422                 name = seq[2:]
1423         } else {
1424                 // Name is in braces.
1425                 end := strings.IndexRune(s, '}')
1426                 if end < 0 {
1427                         if err = checkUTF8(s); err != nil {
1428                                 return
1429                         }
1430                         return nil, "", &Error{ErrInvalidCharRange, s}
1431                 }
1432                 seq, t = s[:end+1], s[end+1:]
1433                 name = s[3:end]
1434                 if err = checkUTF8(name); err != nil {
1435                         return
1436                 }
1437         }
1438
1439         // Group can have leading negation too.  \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1440         if name != "" && name[0] == '^' {
1441                 sign = -sign
1442                 name = name[1:]
1443         }
1444
1445         tab, fold := unicodeTable(name)
1446         if tab == nil {
1447                 return nil, "", &Error{ErrInvalidCharRange, seq}
1448         }
1449
1450         if p.flags&FoldCase == 0 || fold == nil {
1451                 if sign > 0 {
1452                         r = appendTable(r, tab)
1453                 } else {
1454                         r = appendNegatedTable(r, tab)
1455                 }
1456         } else {
1457                 // Merge and clean tab and fold in a temporary buffer.
1458                 // This is necessary for the negative case and just tidy
1459                 // for the positive case.
1460                 tmp := p.tmpClass[:0]
1461                 tmp = appendTable(tmp, tab)
1462                 tmp = appendTable(tmp, fold)
1463                 p.tmpClass = tmp
1464                 tmp = cleanClass(&p.tmpClass)
1465                 if sign > 0 {
1466                         r = appendClass(r, tmp)
1467                 } else {
1468                         r = appendNegatedClass(r, tmp)
1469                 }
1470         }
1471         return r, t, nil
1472 }
1473
1474 // parseClass parses a character class at the beginning of s
1475 // and pushes it onto the parse stack.
1476 func (p *parser) parseClass(s string) (rest string, err error) {
1477         t := s[1:] // chop [
1478         re := p.newRegexp(OpCharClass)
1479         re.Flags = p.flags
1480         re.Rune = re.Rune0[:0]
1481
1482         sign := +1
1483         if t != "" && t[0] == '^' {
1484                 sign = -1
1485                 t = t[1:]
1486
1487                 // If character class does not match \n, add it here,
1488                 // so that negation later will do the right thing.
1489                 if p.flags&ClassNL == 0 {
1490                         re.Rune = append(re.Rune, '\n', '\n')
1491                 }
1492         }
1493
1494         class := re.Rune
1495         first := true // ] and - are okay as first char in class
1496         for t == "" || t[0] != ']' || first {
1497                 // POSIX: - is only okay unescaped as first or last in class.
1498                 // Perl: - is okay anywhere.
1499                 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1500                         _, size := utf8.DecodeRuneInString(t[1:])
1501                         return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1502                 }
1503                 first = false
1504
1505                 // Look for POSIX [:alnum:] etc.
1506                 if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1507                         nclass, nt, err := p.parseNamedClass(t, class)
1508                         if err != nil {
1509                                 return "", err
1510                         }
1511                         if nclass != nil {
1512                                 class, t = nclass, nt
1513                                 continue
1514                         }
1515                 }
1516
1517                 // Look for Unicode character group like \p{Han}.
1518                 nclass, nt, err := p.parseUnicodeClass(t, class)
1519                 if err != nil {
1520                         return "", err
1521                 }
1522                 if nclass != nil {
1523                         class, t = nclass, nt
1524                         continue
1525                 }
1526
1527                 // Look for Perl character class symbols (extension).
1528                 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1529                         class, t = nclass, nt
1530                         continue
1531                 }
1532
1533                 // Single character or simple range.
1534                 rng := t
1535                 var lo, hi rune
1536                 if lo, t, err = p.parseClassChar(t, s); err != nil {
1537                         return "", err
1538                 }
1539                 hi = lo
1540                 // [a-] means (a|-) so check for final ].
1541                 if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1542                         t = t[1:]
1543                         if hi, t, err = p.parseClassChar(t, s); err != nil {
1544                                 return "", err
1545                         }
1546                         if hi < lo {
1547                                 rng = rng[:len(rng)-len(t)]
1548                                 return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1549                         }
1550                 }
1551                 if p.flags&FoldCase == 0 {
1552                         class = AppendRange(class, lo, hi)
1553                 } else {
1554                         class = appendFoldedRange(class, lo, hi)
1555                 }
1556         }
1557         t = t[1:] // chop ]
1558
1559         // Use &re.Rune instead of &class to avoid allocation.
1560         re.Rune = class
1561         class = cleanClass(&re.Rune)
1562         if sign < 0 {
1563                 class = negateClass(class)
1564         }
1565         re.Rune = class
1566         p.push(re)
1567         return t, nil
1568 }
1569
1570 // cleanClass sorts the ranges (pairs of elements of r),
1571 // merges them, and eliminates duplicates.
1572 func cleanClass(rp *[]rune) []rune {
1573
1574         // Sort by lo increasing, hi decreasing to break ties.
1575         sort.Sort(ranges{rp})
1576
1577         r := *rp
1578         if len(r) < 2 {
1579                 return r
1580         }
1581
1582         // Merge abutting, overlapping.
1583         w := 2 // write index
1584         for i := 2; i < len(r); i += 2 {
1585                 lo, hi := r[i], r[i+1]
1586                 if lo <= r[w-1]+1 {
1587                         // merge with previous range
1588                         if hi > r[w-1] {
1589                                 r[w-1] = hi
1590                         }
1591                         continue
1592                 }
1593                 // new disjoint range
1594                 r[w] = lo
1595                 r[w+1] = hi
1596                 w += 2
1597         }
1598
1599         return r[:w]
1600 }
1601
1602 // appendLiteral returns the result of appending the literal x to the class r.
1603 func appendLiteral(r []rune, x rune, flags Flags) []rune {
1604         if flags&FoldCase != 0 {
1605                 return appendFoldedRange(r, x, x)
1606         }
1607         return AppendRange(r, x, x)
1608 }
1609
1610 // appendRange returns the result of appending the range lo-hi to the class r.
1611 func AppendRange(r []rune, lo, hi rune) []rune {
1612         // Expand last range or next to last range if it overlaps or abuts.
1613         // Checking two ranges helps when appending case-folded
1614         // alphabets, so that one range can be expanding A-Z and the
1615         // other expanding a-z.
1616         n := len(r)
1617         for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
1618                 if n >= i {
1619                         rlo, rhi := r[n-i], r[n-i+1]
1620                         if lo <= rhi+1 && rlo <= hi+1 {
1621                                 if lo < rlo {
1622                                         r[n-i] = lo
1623                                 }
1624                                 if hi > rhi {
1625                                         r[n-i+1] = hi
1626                                 }
1627                                 return r
1628                         }
1629                 }
1630         }
1631
1632         return append(r, lo, hi)
1633 }
1634
1635 const (
1636         // minimum and maximum runes involved in folding.
1637         // checked during test.
1638         MinFold = 0x0041
1639         MaxFold = 0x1044f
1640 )
1641
1642 // appendFoldedRange returns the result of appending the range lo-hi
1643 // and its case folding-equivalent runes to the class r.
1644 func appendFoldedRange(r []rune, lo, hi rune) []rune {
1645         // Optimizations.
1646         if lo <= MinFold && hi >= MaxFold {
1647                 // Range is full: folding can't add more.
1648                 return AppendRange(r, lo, hi)
1649         }
1650         if hi < MinFold || lo > MaxFold {
1651                 // Range is outside folding possibilities.
1652                 return AppendRange(r, lo, hi)
1653         }
1654         if lo < MinFold {
1655                 // [lo, MinFold-1] needs no folding.
1656                 r = AppendRange(r, lo, MinFold-1)
1657                 lo = MinFold
1658         }
1659         if hi > MaxFold {
1660                 // [MaxFold+1, hi] needs no folding.
1661                 r = AppendRange(r, MaxFold+1, hi)
1662                 hi = MaxFold
1663         }
1664
1665         // Brute force.  Depend on AppendRange to coalesce ranges on the fly.
1666         for c := lo; c <= hi; c++ {
1667                 r = AppendRange(r, c, c)
1668                 f := unicode.SimpleFold(c)
1669                 for f != c {
1670                         r = AppendRange(r, f, f)
1671                         f = unicode.SimpleFold(f)
1672                 }
1673         }
1674         return r
1675 }
1676
1677 // appendClass returns the result of appending the class x to the class r.
1678 // It assume x is clean.
1679 func appendClass(r []rune, x []rune) []rune {
1680         for i := 0; i < len(x); i += 2 {
1681                 r = AppendRange(r, x[i], x[i+1])
1682         }
1683         return r
1684 }
1685
1686 // appendFolded returns the result of appending the case folding of the class x to the class r.
1687 func appendFoldedClass(r []rune, x []rune) []rune {
1688         for i := 0; i < len(x); i += 2 {
1689                 r = appendFoldedRange(r, x[i], x[i+1])
1690         }
1691         return r
1692 }
1693
1694 // appendNegatedClass returns the result of appending the negation of the class x to the class r.
1695 // It assumes x is clean.
1696 func appendNegatedClass(r []rune, x []rune) []rune {
1697         nextLo := rune('\u0000')
1698         for i := 0; i < len(x); i += 2 {
1699                 lo, hi := x[i], x[i+1]
1700                 if nextLo <= lo-1 {
1701                         r = AppendRange(r, nextLo, lo-1)
1702                 }
1703                 nextLo = hi + 1
1704         }
1705         if nextLo <= unicode.MaxRune {
1706                 r = AppendRange(r, nextLo, unicode.MaxRune)
1707         }
1708         return r
1709 }
1710
1711 // appendTable returns the result of appending x to the class r.
1712 func appendTable(r []rune, x *unicode.RangeTable) []rune {
1713         for _, xr := range x.R16 {
1714                 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1715                 if stride == 1 {
1716                         r = AppendRange(r, lo, hi)
1717                         continue
1718                 }
1719                 for c := lo; c <= hi; c += stride {
1720                         r = AppendRange(r, c, c)
1721                 }
1722         }
1723         for _, xr := range x.R32 {
1724                 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1725                 if stride == 1 {
1726                         r = AppendRange(r, lo, hi)
1727                         continue
1728                 }
1729                 for c := lo; c <= hi; c += stride {
1730                         r = AppendRange(r, c, c)
1731                 }
1732         }
1733         return r
1734 }
1735
1736 // appendNegatedTable returns the result of appending the negation of x to the class r.
1737 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
1738         nextLo := rune('\u0000') // lo end of next class to add
1739         for _, xr := range x.R16 {
1740                 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1741                 if stride == 1 {
1742                         if nextLo <= lo-1 {
1743                                 r = AppendRange(r, nextLo, lo-1)
1744                         }
1745                         nextLo = hi + 1
1746                         continue
1747                 }
1748                 for c := lo; c <= hi; c += stride {
1749                         if nextLo <= c-1 {
1750                                 r = AppendRange(r, nextLo, c-1)
1751                         }
1752                         nextLo = c + 1
1753                 }
1754         }
1755         for _, xr := range x.R32 {
1756                 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1757                 if stride == 1 {
1758                         if nextLo <= lo-1 {
1759                                 r = AppendRange(r, nextLo, lo-1)
1760                         }
1761                         nextLo = hi + 1
1762                         continue
1763                 }
1764                 for c := lo; c <= hi; c += stride {
1765                         if nextLo <= c-1 {
1766                                 r = AppendRange(r, nextLo, c-1)
1767                         }
1768                         nextLo = c + 1
1769                 }
1770         }
1771         if nextLo <= unicode.MaxRune {
1772                 r = AppendRange(r, nextLo, unicode.MaxRune)
1773         }
1774         return r
1775 }
1776
1777 // negateClass overwrites r and returns r's negation.
1778 // It assumes the class r is already clean.
1779 func negateClass(r []rune) []rune {
1780         nextLo := rune('\u0000') // lo end of next class to add
1781         w := 0                   // write index
1782         for i := 0; i < len(r); i += 2 {
1783                 lo, hi := r[i], r[i+1]
1784                 if nextLo <= lo-1 {
1785                         r[w] = nextLo
1786                         r[w+1] = lo - 1
1787                         w += 2
1788                 }
1789                 nextLo = hi + 1
1790         }
1791         r = r[:w]
1792         if nextLo <= unicode.MaxRune {
1793                 // It's possible for the negation to have one more
1794                 // range - this one - than the original class, so use append.
1795                 r = append(r, nextLo, unicode.MaxRune)
1796         }
1797         return r
1798 }
1799
1800 // ranges implements sort.Interface on a []rune.
1801 // The choice of receiver type definition is strange
1802 // but avoids an allocation since we already have
1803 // a *[]rune.
1804 type ranges struct {
1805         p *[]rune
1806 }
1807
1808 func (ra ranges) Less(i, j int) bool {
1809         p := *ra.p
1810         i *= 2
1811         j *= 2
1812         return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
1813 }
1814
1815 func (ra ranges) Len() int {
1816         return len(*ra.p) / 2
1817 }
1818
1819 func (ra ranges) Swap(i, j int) {
1820         p := *ra.p
1821         i *= 2
1822         j *= 2
1823         p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
1824 }
1825
1826 func checkUTF8(s string) error {
1827         for s != "" {
1828                 rune, size := utf8.DecodeRuneInString(s)
1829                 if rune == utf8.RuneError && size == 1 {
1830                         return &Error{Code: ErrInvalidUTF8, Expr: s}
1831                 }
1832                 s = s[size:]
1833         }
1834         return nil
1835 }
1836
1837 func nextRune(s string) (c rune, t string, err error) {
1838         c, size := utf8.DecodeRuneInString(s)
1839         if c == utf8.RuneError && size == 1 {
1840                 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1841         }
1842         return c, s[size:], nil
1843 }
1844
1845 func isalnum(c rune) bool {
1846         return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
1847 }
1848
1849 func unhex(c rune) rune {
1850         if '0' <= c && c <= '9' {
1851                 return c - '0'
1852         }
1853         if 'a' <= c && c <= 'f' {
1854                 return c - 'a' + 10
1855         }
1856         if 'A' <= c && c <= 'F' {
1857                 return c - 'A' + 10
1858         }
1859         return -1
1860 }