libjava/classpath/gnu/java/util/regex/RE.java

   1 /* gnu/regexp/RE.java
   2    Copyright (C) 2006 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 package gnu.java.util.regex;
  39 import java.io.InputStream;
  40 import java.io.Serializable;
  41 import java.util.Locale;
  42 import java.util.PropertyResourceBundle;
  43 import java.util.ResourceBundle;
  44 import java.util.Stack;
  45 import java.util.Vector;
  46
  47 /**
  48  * RE provides the user interface for compiling and matching regular
  49  * expressions.
  50  * <P>
  51  * A regular expression object (class RE) is compiled by constructing it
  52  * from a String, StringBuffer or character array, with optional
  53  * compilation flags (below)
  54  * and an optional syntax specification (see RESyntax; if not specified,
  55  * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
  56  * <P>
  57  * Once compiled, a regular expression object is reusable as well as
  58  * threadsafe: multiple threads can use the RE instance simultaneously
  59  * to match against different input text.
  60  * <P>
  61  * Various methods attempt to match input text against a compiled
  62  * regular expression.  These methods are:
  63  * <LI><code>isMatch</code>: returns true if the input text in its
  64  * entirety matches the regular expression pattern.
  65  * <LI><code>getMatch</code>: returns the first match found in the
  66  * input text, or null if no match is found.
  67  * <LI><code>getAllMatches</code>: returns an array of all
  68  * non-overlapping matches found in the input text.  If no matches are
  69  * found, the array is zero-length.
  70  * <LI><code>substitute</code>: substitute the first occurence of the
  71  * pattern in the input text with a replacement string (which may
  72  * include metacharacters $0-$9, see REMatch.substituteInto).
  73  * <LI><code>substituteAll</code>: same as above, but repeat for each
  74  * match before returning.
  75  * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
  76  * object that allows iteration over the matches (see
  77  * REMatchEnumeration for some reasons why you may want to do this
  78  * instead of using <code>getAllMatches</code>.
  79  * <P>
  80  *
  81  * These methods all have similar argument lists.  The input can be a
  82  * CharIndexed, String, a character array, a StringBuffer, or an
  83  * InputStream of some sort.  Note that when using an
  84  * InputStream, the stream read position cannot be guaranteed after
  85  * attempting a match (this is not a bug, but a consequence of the way
  86  * regular expressions work).  Using an REMatchEnumeration can
  87  * eliminate most positioning problems.
  88  *
  89  * Although the input object can be of various types, it is recommended
  90  * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
  91  * can show the last match found on this input, which helps the expression
  92  * \G work as the end of the previous match.
  93  *
  94  * <P>
  95  *
  96  * The optional index argument specifies the offset from the beginning
  97  * of the text at which the search should start (see the descriptions
  98  * of some of the execution flags for how this can affect positional
  99  * pattern operators).  For an InputStream, this means an
 100  * offset from the current read position, so subsequent calls with the
 101  * same index argument on an InputStream will not
 102  * necessarily access the same position on the stream, whereas
 103  * repeated searches at a given index in a fixed string will return
 104  * consistent results.
 105  *
 106  * <P>
 107  * You can optionally affect the execution environment by using a
 108  * combination of execution flags (constants listed below).
 109  *
 110  * <P>
 111  * All operations on a regular expression are performed in a
 112  * thread-safe manner.
 113  *
 114  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
 115  * @version 1.1.5-dev, to be released
 116  */
 117
 118 public class RE extends REToken {
 119
 120   private static final class IntPair implements Serializable {
 121     public int first, second;
 122   }
 123
 124   private static final class CharUnit implements Serializable {
 125     public char ch;
 126     public boolean bk;
 127   }
 128
 129   // This String will be returned by getVersion()
 130   private static final String VERSION = "1.1.5-dev";
 131
 132   // The localized strings are kept in a separate file
 133   // Used by getLocalizedMessage().
 134   private static ResourceBundle messages;
 135
 136   // Name of the bundle that contains the localized messages.
 137   private static final String bundle = "gnu/java/util/regex/MessagesBundle";
 138
 139   // These are, respectively, the first and last tokens in our linked list
 140   // If there is only one token, firstToken == lastToken
 141   private REToken firstToken, lastToken;
 142
 143   // This is the number of subexpressions in this regular expression,
 144   // with a minimum value of zero.  Returned by getNumSubs()
 145   private int numSubs;
 146
 147     /** Minimum length, in characters, of any possible match. */
 148     private int minimumLength;
 149     private int maximumLength;
 150
 151   /**
 152    * Compilation flag. Do  not  differentiate  case.   Subsequent
 153    * searches  using  this  RE will be case insensitive.
 154    */
 155   public static final int REG_ICASE = 0x02;
 156
 157   /**
 158    * Compilation flag. The match-any-character operator (dot)
 159    * will match a newline character.  When set this overrides the syntax
 160    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
 161    * the "/s" operator in Perl.
 162    */
 163   public static final int REG_DOT_NEWLINE = 0x04;
 164
 165   /**
 166    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
 167    * anchors will match based on newlines within the input. This is
 168    * equivalent to the "/m" operator in Perl.
 169    */
 170   public static final int REG_MULTILINE = 0x08;
 171
 172   /**
 173    * Execution flag.
 174    * The match-beginning operator (^) will not match at the beginning
 175    * of the input string. Useful for matching on a substring when you
 176    * know the context of the input is such that position zero of the
 177    * input to the match test is not actually position zero of the text.
 178    * <P>
 179    * This example demonstrates the results of various ways of matching on
 180    * a substring.
 181    * <P>
 182    * <CODE>
 183    * String s = "food bar fool";<BR>
 184    * RE exp = new RE("^foo.");<BR>
 185    * REMatch m0 = exp.getMatch(s);<BR>
 186    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
 187    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
 188    * REMatch m3 = exp.getMatch(s,8);                            <BR>
 189    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
 190    * <P>
 191    * // Results:<BR>
 192    * //  m0.toString(): "food"<BR>
 193    * //  m1.toString(): "fool"<BR>
 194    * //  m2.toString(): null<BR>
 195    * //  m3.toString(): null<BR>
 196    * //  m4.toString(): "fool"<BR>
 197    * </CODE>
 198    */
 199   public static final int REG_NOTBOL = 0x10;
 200
 201   /**
 202    * Execution flag.
 203    * The match-end operator ($) does not match at the end
 204    * of the input string. Useful for matching on substrings.
 205    */
 206   public static final int REG_NOTEOL = 0x20;
 207
 208   /**
 209    * Execution flag.
 210    * When a match method is invoked that starts matching at a non-zero
 211    * index into the input, treat the input as if it begins at the index
 212    * given.  The effect of this flag is that the engine does not "see"
 213    * any text in the input before the given index.  This is useful so
 214    * that the match-beginning operator (^) matches not at position 0
 215    * in the input string, but at the position the search started at
 216    * (based on the index input given to the getMatch function).  See
 217    * the example under REG_NOTBOL.  It also affects the use of the \&lt;
 218    * and \b operators.
 219    */
 220   public static final int REG_ANCHORINDEX = 0x40;
 221
 222   /**
 223    * Execution flag.
 224    * The substitute and substituteAll methods will not attempt to
 225    * interpolate occurrences of $1-$9 in the replacement text with
 226    * the corresponding subexpressions.  For example, you may want to
 227    * replace all matches of "one dollar" with "$1".
 228    */
 229   public static final int REG_NO_INTERPOLATE = 0x80;
 230
 231   /**
 232    * Execution flag.
 233    * Try to match the whole input string. An implicit match-end operator
 234    * is added to this regexp.
 235    */
 236   public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
 237
 238   /**
 239    * Execution flag.
 240    * The substitute and substituteAll methods will treat the
 241    * character '\' in the replacement as an escape to a literal
 242    * character. In this case "\n", "\$", "\\", "\x40" and "\012"
 243    * will become "n", "$", "\", "x40" and "012" respectively.
 244    * This flag has no effect if REG_NO_INTERPOLATE is set on.
 245    */
 246   public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
 247
 248   /**
 249    * Compilation flag. Allow whitespace and comments in pattern.
 250    * This is equivalent to the "/x" operator in Perl.
 251    */
 252   public static final int REG_X_COMMENTS = 0x0400;
 253
 254   /**
 255    * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
 256    */
 257   public static final int REG_ICASE_USASCII = 0x0800;
 258
 259   /**
 260    * Execution flag.
 261    * Do not move the position at which the search begins.  If not set,
 262    * the starting position will be moved until a match is found.
 263    */
 264   public static final int REG_FIX_STARTING_POSITION = 0x1000;
 265
 266   /** Returns a string representing the version of the gnu.regexp package. */
 267   public static final String version() {
 268     return VERSION;
 269   }
 270
 271   // Retrieves a message from the ResourceBundle
 272   static final String getLocalizedMessage(String key) {
 273     if (messages == null)
 274       messages = PropertyResourceBundle.getBundle(bundle, Locale.getDefault());
 275     return messages.getString(key);
 276   }
 277
 278   /**
 279    * Constructs a regular expression pattern buffer without any compilation
 280    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
 281    *
 282    * @param pattern A regular expression pattern, in the form of a String,
 283    *   StringBuffer or char[].  Other input types will be converted to
 284    *   strings using the toString() method.
 285    * @exception REException The input pattern could not be parsed.
 286    * @exception NullPointerException The pattern was null.
 287    */
 288   public RE(Object pattern) throws REException {
 289     this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
 290   }
 291
 292   /**
 293    * Constructs a regular expression pattern buffer using the specified
 294    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
 295    *
 296    * @param pattern A regular expression pattern, in the form of a String,
 297    *   StringBuffer, or char[].  Other input types will be converted to
 298    *   strings using the toString() method.
 299    * @param cflags The logical OR of any combination of the compilation flags listed above.
 300    * @exception REException The input pattern could not be parsed.
 301    * @exception NullPointerException The pattern was null.
 302    */
 303   public RE(Object pattern, int cflags) throws REException {
 304     this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
 305   }
 306
 307   /**
 308    * Constructs a regular expression pattern buffer using the specified
 309    * compilation flags and regular expression syntax.
 310    *
 311    * @param pattern A regular expression pattern, in the form of a String,
 312    *   StringBuffer, or char[].  Other input types will be converted to
 313    *   strings using the toString() method.
 314    * @param cflags The logical OR of any combination of the compilation flags listed above.
 315    * @param syntax The type of regular expression syntax to use.
 316    * @exception REException The input pattern could not be parsed.
 317    * @exception NullPointerException The pattern was null.
 318    */
 319   public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
 320     this(pattern,cflags,syntax,0,0);
 321   }
 322
 323   // internal constructor used for alternation
 324   private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
 325     super(subIndex);
 326     firstToken = first;
 327     lastToken = last;
 328     numSubs = subs;
 329     minimumLength = minLength;
 330     maximumLength = maxLength;
 331     addToken(new RETokenEndSub(subIndex));
 332   }
 333
 334   private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 335     super(myIndex); // Subexpression index of this token.
 336     initialize(patternObj, cflags, syntax, myIndex, nextSub);
 337   }
 338
 339     // For use by subclasses
 340     protected RE() { super(0); }
 341
 342     // The meat of construction
 343   protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
 344       char[] pattern;
 345     if (patternObj instanceof String) {
 346       pattern = ((String) patternObj).toCharArray();
 347     } else if (patternObj instanceof char[]) {
 348       pattern = (char[]) patternObj;
 349     } else if (patternObj instanceof StringBuffer) {
 350       pattern = new char [((StringBuffer) patternObj).length()];
 351       ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
 352     } else {
 353         pattern = patternObj.toString().toCharArray();
 354     }
 355
 356     int pLength = pattern.length;
 357
 358     numSubs = 0; // Number of subexpressions in this token.
 359     Vector branches = null;
 360
 361     // linked list of tokens (sort of -- some closed loops can exist)
 362     firstToken = lastToken = null;
 363
 364     // Precalculate these so we don't pay for the math every time we
 365     // need to access them.
 366     boolean insens = ((cflags & REG_ICASE) > 0);
 367     boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 368
 369     // Parse pattern into tokens.  Does anyone know if it's more efficient
 370     // to use char[] than a String.charAt()?  I'm assuming so.
 371
 372     // index tracks the position in the char array
 373     int index = 0;
 374
 375     // this will be the current parse character (pattern[index])
 376     CharUnit unit = new CharUnit();
 377
 378     // This is used for {x,y} calculations
 379     IntPair minMax = new IntPair();
 380
 381     // Buffer a token so we can create a TokenRepeated, etc.
 382     REToken currentToken = null;
 383     char ch;
 384     boolean quot = false;
 385
 386     // Saved syntax and flags.
 387     RESyntax savedSyntax = null;
 388     int savedCflags = 0;
 389     boolean flagsSaved = false;
 390
 391     while (index < pLength) {
 392       // read the next character unit (including backslash escapes)
 393       index = getCharUnit(pattern,index,unit,quot);
 394
 395       if (unit.bk)
 396         if (unit.ch == 'Q') {
 397           quot = true;
 398           continue;
 399         } else if (unit.ch == 'E') {
 400           quot = false;
 401           continue;
 402         }
 403       if (quot)
 404         unit.bk = false;
 405
 406       if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) {
 407         if (Character.isWhitespace(unit.ch)) {
 408              continue;
 409         }
 410         if (unit.ch == '#') {
 411           for (int i = index; i < pLength; i++) {
 412             if (pattern[i] == '\n') {
 413               index = i + 1;
 414               continue;
 415             }
 416             else if (pattern[i] == '\r') {
 417               if (i + 1 < pLength && pattern[i + 1] == '\n') {
 418                 index = i + 2;
 419               }
 420               else {
 421                 index = i + 1;
 422               }
 423               continue;
 424             }
 425           }
 426           index = pLength;
 427           continue;
 428         }
 429       }
 430
 431       // ALTERNATION OPERATOR
 432       //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
 433       //  not available if RE_LIMITED_OPS is set
 434
 435       // TODO: the '\n' literal here should be a test against REToken.newline,
 436       // which unfortunately may be more than a single character.
 437       if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
 438              || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) )
 439            && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
 440         // make everything up to here be a branch. create vector if nec.
 441         addToken(currentToken);
 442         RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
 443         minimumLength = 0;
 444         maximumLength = 0;
 445         if (branches == null) {
 446             branches = new Vector();
 447         }
 448         branches.addElement(theBranch);
 449         firstToken = lastToken = currentToken = null;
 450       }
 451
 452       // INTERVAL OPERATOR:
 453       //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
 454       //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
 455       //
 456       // OPEN QUESTION:
 457       //  what is proper interpretation of '{' at start of string?
 458       //
 459       // This method used to check "repeat.empty.token" to avoid such regexp
 460       // as "(a*){2,}", but now "repeat.empty.token" is allowed.
 461
 462       else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) {
 463         int newIndex = getMinMax(pattern,index,minMax,syntax);
 464         if (newIndex > index) {
 465           if (minMax.first > minMax.second)
 466             throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
 467           if (currentToken == null)
 468             throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
 469           if (currentToken instanceof RETokenRepeated)
 470             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
 471           if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 472             throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
 473           index = newIndex;
 474           currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
 475         }
 476         else {
 477           addToken(currentToken);
 478           currentToken = new RETokenChar(subIndex,unit.ch,insens);
 479           if (insensUSASCII) currentToken.unicodeAware = false;
 480         }
 481       }
 482
 483       // LIST OPERATOR:
 484       //  [...] | [^...]
 485
 486       else if ((unit.ch == '[') && !(unit.bk || quot)) {
 487         // Create a new RETokenOneOf
 488         ParseCharClassResult result = parseCharClass(
 489                 subIndex, pattern, index, pLength, cflags, syntax, 0);
 490         addToken(currentToken);
 491         currentToken = result.token;
 492         index = result.index;
 493       }
 494
 495       // SUBEXPRESSIONS
 496       //  (...) | \(...\) depending on RE_NO_BK_PARENS
 497
 498       else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) {
 499         boolean pure = false;
 500         boolean comment = false;
 501         boolean lookAhead = false;
 502         boolean lookBehind = false;
 503         boolean independent = false;
 504         boolean negativelh = false;
 505         boolean negativelb = false;
 506         if ((index+1 < pLength) && (pattern[index] == '?')) {
 507           switch (pattern[index+1]) {
 508           case '!':
 509             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 510               pure = true;
 511               negativelh = true;
 512               lookAhead = true;
 513               index += 2;
 514             }
 515             break;
 516           case '=':
 517             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 518               pure = true;
 519               lookAhead = true;
 520               index += 2;
 521             }
 522             break;
 523           case '<':
 524             // We assume that if the syntax supports look-ahead,
 525             // it also supports look-behind.
 526             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 527                 index++;
 528                 switch (pattern[index +1]) {
 529                 case '!':
 530                   pure = true;
 531                   negativelb = true;
 532                   lookBehind = true;
 533                   index += 2;
 534                   break;
 535                 case '=':
 536                   pure = true;
 537                   lookBehind = true;
 538                   index += 2;
 539                 }
 540             }
 541             break;
 542           case '>':
 543             // We assume that if the syntax supports look-ahead,
 544             // it also supports independent group.
 545             if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
 546               pure = true;
 547               independent = true;
 548               index += 2;
 549             }
 550             break;
 551           case 'i':
 552           case 'd':
 553           case 'm':
 554           case 's':
 555           case 'u':
 556           case 'x':
 557           case '-':
 558             if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
 559             // Set or reset syntax flags.
 560             int flagIndex = index + 1;
 561             int endFlag = -1;
 562             RESyntax newSyntax = new RESyntax(syntax);
 563             int newCflags = cflags;
 564             boolean negate = false;
 565             while (flagIndex < pLength && endFlag < 0) {
 566                 switch(pattern[flagIndex]) {
 567                 case 'i':
 568                   if (negate)
 569                     newCflags &= ~REG_ICASE;
 570                   else
 571                     newCflags |= REG_ICASE;
 572                   flagIndex++;
 573                   break;
 574                 case 'd':
 575                   if (negate)
 576                     newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR);
 577                   else
 578                     newSyntax.setLineSeparator("\n");
 579                   flagIndex++;
 580                   break;
 581                 case 'm':
 582                   if (negate)
 583                     newCflags &= ~REG_MULTILINE;
 584                   else
 585                     newCflags |= REG_MULTILINE;
 586                   flagIndex++;
 587                   break;
 588                 case 's':
 589                   if (negate)
 590                     newCflags &= ~REG_DOT_NEWLINE;
 591                   else
 592                     newCflags |= REG_DOT_NEWLINE;
 593                   flagIndex++;
 594                   break;
 595                 case 'u':
 596                   if (negate)
 597                     newCflags |= REG_ICASE_USASCII;
 598                   else
 599                     newCflags &= ~REG_ICASE_USASCII;
 600                   flagIndex++;
 601                   break;
 602                 case 'x':
 603                   if (negate)
 604                     newCflags &= ~REG_X_COMMENTS;
 605                   else
 606                     newCflags |= REG_X_COMMENTS;
 607                   flagIndex++;
 608                   break;
 609                 case '-':
 610                   negate = true;
 611                   flagIndex++;
 612                   break;
 613                 case ':':
 614                 case ')':
 615                   endFlag = pattern[flagIndex];
 616                   break;
 617                 default:
 618                   throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 619                 }
 620             }
 621             if (endFlag == ')') {
 622                 syntax = newSyntax;
 623                 cflags = newCflags;
 624                 insens = ((cflags & REG_ICASE) > 0);
 625                 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 626                 // This can be treated as though it were a comment.
 627                 comment = true;
 628                 index = flagIndex - 1;
 629                 break;
 630             }
 631             if (endFlag == ':') {
 632                 savedSyntax = syntax;
 633                 savedCflags = cflags;
 634                 flagsSaved = true;
 635                 syntax = newSyntax;
 636                 cflags = newCflags;
 637                 insens = ((cflags & REG_ICASE) > 0);
 638                 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 639                 index = flagIndex -1;
 640                 // Fall through to the next case.
 641             }
 642             else {
 643                 throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 644             }
 645           case ':':
 646             if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
 647               pure = true;
 648               index += 2;
 649             }
 650             break;
 651           case '#':
 652             if (syntax.get(RESyntax.RE_COMMENTS)) {
 653               comment = true;
 654             }
 655             break;
 656           default:
 657             throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
 658           }
 659         }
 660
 661         if (index >= pLength) {
 662             throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
 663         }
 664
 665         // find end of subexpression
 666         int endIndex = index;
 667         int nextIndex = index;
 668         int nested = 0;
 669
 670         while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0)
 671                 && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) ) {
 672           if ((endIndex = nextIndex) >= pLength)
 673             throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 674           else if ((unit.ch == '[') && !(unit.bk || quot)) {
 675             // I hate to do something similar to the LIST OPERATOR matters
 676             // above, but ...
 677             int listIndex = nextIndex;
 678             if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
 679             if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
 680             int listEndIndex = -1;
 681             int listNest = 0;
 682             while (listIndex < pLength && listEndIndex < 0) {
 683               switch(pattern[listIndex++]) {
 684                 case '\\':
 685                   listIndex++;
 686                   break;
 687                 case '[':
 688                   // Sun's API document says that regexp like "[a-d[m-p]]"
 689                   // is legal. Even something like "[[[^]]]]" is accepted.
 690                   listNest++;
 691                   if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
 692                   if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
 693                   break;
 694                 case ']':
 695                   if (listNest == 0)
 696                     listEndIndex = listIndex;
 697                   listNest--;
 698                   break;
 699               }
 700             }
 701             if (listEndIndex >= 0) {
 702               nextIndex = listEndIndex;
 703               if ((endIndex = nextIndex) >= pLength)
 704                 throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 705               else
 706                 continue;
 707             }
 708             throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
 709           }
 710           else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 711             nested++;
 712           else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
 713             nested--;
 714         }
 715
 716         // endIndex is now position at a ')','\)'
 717         // nextIndex is end of string or position after ')' or '\)'
 718
 719         if (comment) index = nextIndex;
 720         else { // not a comment
 721           // create RE subexpression as token.
 722           addToken(currentToken);
 723           if (!pure) {
 724             numSubs++;
 725           }
 726
 727           int useIndex = (pure || lookAhead || lookBehind || independent) ?
 728                          0 : nextSub + numSubs;
 729           currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
 730           numSubs += ((RE) currentToken).getNumSubs();
 731
 732           if (lookAhead) {
 733               currentToken = new RETokenLookAhead(currentToken,negativelh);
 734           }
 735           else if (lookBehind) {
 736               currentToken = new RETokenLookBehind(currentToken,negativelb);
 737           }
 738           else if (independent) {
 739               currentToken = new RETokenIndependent(currentToken);
 740           }
 741
 742           index = nextIndex;
 743           if (flagsSaved) {
 744               syntax = savedSyntax;
 745               cflags = savedCflags;
 746               insens = ((cflags & REG_ICASE) > 0);
 747               insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 748               flagsSaved = false;
 749           }
 750         } // not a comment
 751       } // subexpression
 752
 753       // UNMATCHED RIGHT PAREN
 754       // ) or \) throw exception if
 755       // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
 756       else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) {
 757         throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
 758       }
 759
 760       // START OF LINE OPERATOR
 761       //  ^
 762
 763       else if ((unit.ch == '^') && !(unit.bk || quot)) {
 764         addToken(currentToken);
 765         currentToken = null;
 766         RETokenStart token = null;
 767         if ((cflags & REG_MULTILINE) > 0) {
 768             String sep = syntax.getLineSeparator();
 769             if (sep == null) {
 770                 token = new RETokenStart(subIndex, null, true);
 771             }
 772             else {
 773                 token = new RETokenStart(subIndex, sep);
 774             }
 775         }
 776         else {
 777             token = new RETokenStart(subIndex, null);
 778         }
 779         addToken(token);
 780       }
 781
 782       // END OF LINE OPERATOR
 783       //  $
 784
 785       else if ((unit.ch == '$') && !(unit.bk || quot)) {
 786         addToken(currentToken);
 787         currentToken = null;
 788         RETokenEnd token = null;
 789         if ((cflags & REG_MULTILINE) > 0) {
 790             String sep = syntax.getLineSeparator();
 791             if (sep == null) {
 792                 token = new RETokenEnd(subIndex, null, true);
 793             }
 794             else {
 795                 token = new RETokenEnd(subIndex, sep);
 796             }
 797         }
 798         else {
 799             token = new RETokenEnd(subIndex, null);
 800         }
 801         addToken(token);
 802       }
 803
 804       // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
 805       //  .
 806
 807       else if ((unit.ch == '.') && !(unit.bk || quot)) {
 808         addToken(currentToken);
 809         currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
 810       }
 811
 812       // ZERO-OR-MORE REPEAT OPERATOR
 813       //  *
 814       //
 815       // This method used to check "repeat.empty.token" to avoid such regexp
 816       // as "(a*)*", but now "repeat.empty.token" is allowed.
 817
 818       else if ((unit.ch == '*') && !(unit.bk || quot)) {
 819         if (currentToken == null)
 820           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 821         if (currentToken instanceof RETokenRepeated)
 822           throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 823         if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 824           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 825         currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
 826       }
 827
 828       // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
 829       //  + | \+ depending on RE_BK_PLUS_QM
 830       //  not available if RE_LIMITED_OPS is set
 831       //
 832       // This method used to check "repeat.empty.token" to avoid such regexp
 833       // as "(a*)+", but now "repeat.empty.token" is allowed.
 834
 835       else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 836         if (currentToken == null)
 837           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 838
 839         // Check for possessive matching on RETokenRepeated
 840         if (currentToken instanceof RETokenRepeated) {
 841           RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
 842           if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy())
 843             tokenRep.makePossessive();
 844           else
 845             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 846
 847         }
 848         else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 849           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 850         else
 851           currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
 852       }
 853
 854       // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
 855       //  ? | \? depending on RE_BK_PLUS_QM
 856       //  not available if RE_LIMITED_OPS is set
 857       //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
 858
 859       else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 860         if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
 861
 862         // Check for stingy matching on RETokenRepeated
 863         if (currentToken instanceof RETokenRepeated) {
 864           RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
 865           if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive())
 866             tokenRep.makeStingy();
 867           else
 868             throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
 869         }
 870         else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 871           throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 872         else
 873           currentToken = setRepeated(currentToken,0,1,index);
 874       }
 875
 876       // OCTAL CHARACTER
 877       //  \0377
 878
 879       else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) {
 880         CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
 881         if (ce == null)
 882           throw new REException("invalid octal character", REException.REG_ESCAPE, index);
 883         index = index - 2 + ce.len;
 884         addToken(currentToken);
 885         currentToken = new RETokenChar(subIndex,ce.ch,insens);
 886         if (insensUSASCII) currentToken.unicodeAware = false;
 887       }
 888
 889       // BACKREFERENCE OPERATOR
 890       //  \1 \2 ... \9 and \10 \11 \12 ...
 891       // not available if RE_NO_BK_REFS is set
 892       // Perl recognizes \10, \11, and so on only if enough number of
 893       // parentheses have opened before it, otherwise they are treated
 894       // as aliases of \010, \011, ... (octal characters).  In case of
 895       // Sun's JDK, octal character expression must always begin with \0.
 896       // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
 897       // JDK treats \2 as a back reference to the 2nd group because
 898       // there are only two groups. But in our poor implementation,
 899       // we cannot help but treat \29 as a back reference to the 29th group.
 900
 901       else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
 902         addToken(currentToken);
 903         int numBegin = index - 1;
 904         int numEnd = pLength;
 905         for (int i = index; i < pLength; i++) {
 906             if (! Character.isDigit(pattern[i])) {
 907                 numEnd = i;
 908                 break;
 909             }
 910         }
 911         int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
 912
 913         currentToken = new RETokenBackRef(subIndex,num,insens);
 914         if (insensUSASCII) currentToken.unicodeAware = false;
 915         index = numEnd;
 916       }
 917
 918       // START OF STRING OPERATOR
 919       //  \A if RE_STRING_ANCHORS is set
 920
 921       else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 922         addToken(currentToken);
 923         currentToken = new RETokenStart(subIndex,null);
 924       }
 925
 926       // WORD BREAK OPERATOR
 927       //  \b if ????
 928
 929       else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 930           addToken(currentToken);
 931           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
 932       }
 933
 934       // WORD BEGIN OPERATOR
 935       //  \< if ????
 936       else if (unit.bk && (unit.ch == '<')) {
 937           addToken(currentToken);
 938           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
 939       }
 940
 941       // WORD END OPERATOR
 942       //  \> if ????
 943       else if (unit.bk && (unit.ch == '>')) {
 944           addToken(currentToken);
 945           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
 946       }
 947
 948       // NON-WORD BREAK OPERATOR
 949       // \B if ????
 950
 951       else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
 952           addToken(currentToken);
 953           currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
 954       }
 955
 956
 957       // DIGIT OPERATOR
 958       //  \d if RE_CHAR_CLASS_ESCAPES is set
 959
 960       else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 961         addToken(currentToken);
 962         currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
 963         if (insensUSASCII) currentToken.unicodeAware = false;
 964       }
 965
 966       // NON-DIGIT OPERATOR
 967       //  \D
 968
 969         else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 970           addToken(currentToken);
 971           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
 972           if (insensUSASCII) currentToken.unicodeAware = false;
 973         }
 974
 975         // NEWLINE ESCAPE
 976         //  \n
 977
 978         else if (unit.bk && (unit.ch == 'n')) {
 979           addToken(currentToken);
 980           currentToken = new RETokenChar(subIndex,'\n',false);
 981         }
 982
 983         // RETURN ESCAPE
 984         //  \r
 985
 986         else if (unit.bk && (unit.ch == 'r')) {
 987           addToken(currentToken);
 988           currentToken = new RETokenChar(subIndex,'\r',false);
 989         }
 990
 991         // WHITESPACE OPERATOR
 992         //  \s if RE_CHAR_CLASS_ESCAPES is set
 993
 994         else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
 995           addToken(currentToken);
 996           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
 997           if (insensUSASCII) currentToken.unicodeAware = false;
 998         }
 999
1000         // NON-WHITESPACE OPERATOR
1001         //  \S
1002
1003         else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1004           addToken(currentToken);
1005           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
1006           if (insensUSASCII) currentToken.unicodeAware = false;
1007         }
1008
1009         // TAB ESCAPE
1010         //  \t
1011
1012         else if (unit.bk && (unit.ch == 't')) {
1013           addToken(currentToken);
1014           currentToken = new RETokenChar(subIndex,'\t',false);
1015         }
1016
1017         // ALPHANUMERIC OPERATOR
1018         //  \w
1019
1020         else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1021           addToken(currentToken);
1022           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
1023           if (insensUSASCII) currentToken.unicodeAware = false;
1024         }
1025
1026         // NON-ALPHANUMERIC OPERATOR
1027         //  \W
1028
1029         else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1030           addToken(currentToken);
1031           currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
1032           if (insensUSASCII) currentToken.unicodeAware = false;
1033         }
1034
1035         // END OF STRING OPERATOR
1036         //  \Z, \z
1037
1038         // FIXME: \Z and \z are different in that if the input string
1039         // ends with a line terminator, \Z matches the position before
1040         // the final terminator.  This special behavior of \Z is yet
1041         // to be implemented.
1042
1043         else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1044                  syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1045           addToken(currentToken);
1046           currentToken = new RETokenEnd(subIndex,null);
1047         }
1048
1049         // HEX CHARACTER, UNICODE CHARACTER
1050         //  \x1B, \u1234
1051
1052         else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1053                  (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1054           CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
1055           if (ce == null)
1056             throw new REException("invalid hex character", REException.REG_ESCAPE, index);
1057           index = index - 2 + ce.len;
1058           addToken(currentToken);
1059           currentToken = new RETokenChar(subIndex,ce.ch,insens);
1060           if (insensUSASCII) currentToken.unicodeAware = false;
1061         }
1062
1063         // NAMED PROPERTY
1064         // \p{prop}, \P{prop}
1065
1066         else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
1067                  (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
1068           NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
1069           if (np == null)
1070               throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1071           index = index - 2 + np.len;
1072           addToken(currentToken);
1073           currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
1074           if (insensUSASCII) currentToken.unicodeAware = false;
1075         }
1076
1077         // END OF PREVIOUS MATCH
1078         //  \G
1079
1080         else if (unit.bk && (unit.ch == 'G') &&
1081                  syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1082           addToken(currentToken);
1083           currentToken = new RETokenEndOfPreviousMatch(subIndex);
1084         }
1085
1086         // NON-SPECIAL CHARACTER (or escape to make literal)
1087         //  c | \* for example
1088
1089         else {  // not a special character
1090           addToken(currentToken);
1091           currentToken = new RETokenChar(subIndex,unit.ch,insens);
1092           if (insensUSASCII) currentToken.unicodeAware = false;
1093         }
1094       } // end while
1095
1096     // Add final buffered token and an EndSub marker
1097     addToken(currentToken);
1098
1099     if (branches != null) {
1100         branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
1101         branches.trimToSize(); // compact the Vector
1102         minimumLength = 0;
1103         maximumLength = 0;
1104         firstToken = lastToken = null;
1105         addToken(new RETokenOneOf(subIndex,branches,false));
1106     }
1107     else addToken(new RETokenEndSub(subIndex));
1108
1109   }
1110
1111   private static class ParseCharClassResult {
1112       RETokenOneOf token;
1113       int index;
1114       boolean returnAtAndOperator = false;
1115   }
1116
1117   /**
1118    * Parse [...] or [^...] and make an RETokenOneOf instance.
1119    * @param subIndex subIndex to be given to the created RETokenOneOf instance.
1120    * @param pattern Input array of characters to be parsed.
1121    * @param index Index pointing to the character next to the beginning '['.
1122    * @param pLength Limit of the input array.
1123    * @param cflags Compilation flags used to parse the pattern.
1124    * @param pflags Flags that affect the behavior of this method.
1125    * @param syntax Syntax used to parse the pattern.
1126    */
1127   private static ParseCharClassResult parseCharClass(int subIndex,
1128                 char[] pattern, int index,
1129                 int pLength, int cflags, RESyntax syntax, int pflags)
1130                 throws REException {
1131
1132         boolean insens = ((cflags & REG_ICASE) > 0);
1133         boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
1134         Vector options = new Vector();
1135         Vector addition = new Vector();
1136         boolean additionAndAppeared = false;
1137         final int RETURN_AT_AND = 0x01;
1138         boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1139         boolean negative = false;
1140         char ch;
1141
1142         char lastChar = 0;
1143         boolean lastCharIsSet = false;
1144         if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
1145
1146         // Check for initial caret, negation
1147         if ((ch = pattern[index]) == '^') {
1148           negative = true;
1149           if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1150           ch = pattern[index];
1151         }
1152
1153         // Check for leading right bracket literal
1154         if (ch == ']') {
1155           lastChar = ch; lastCharIsSet = true;
1156           if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1157         }
1158
1159         while ((ch = pattern[index++]) != ']') {
1160           if ((ch == '-') && (lastCharIsSet)) {
1161             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1162             if ((ch = pattern[index]) == ']') {
1163               RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
1164               if (insensUSASCII) t.unicodeAware = false;
1165               options.addElement(t);
1166               lastChar = '-';
1167             } else {
1168               if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1169                 CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
1170                 if (ce == null)
1171                   throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1172                 ch = ce.ch;
1173                 index = index + ce.len - 1;
1174               }
1175               RETokenRange t = new RETokenRange(subIndex,lastChar,ch,insens);
1176               if (insensUSASCII) t.unicodeAware = false;
1177               options.addElement(t);
1178               lastChar = 0; lastCharIsSet = false;
1179               index++;
1180             }
1181           } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1182             if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1183             int posixID = -1;
1184             boolean negate = false;
1185             char asciiEsc = 0;
1186             boolean asciiEscIsSet = false;
1187             NamedProperty np = null;
1188             if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
1189               switch (pattern[index]) {
1190               case 'D':
1191                 negate = true;
1192               case 'd':
1193                 posixID = RETokenPOSIX.DIGIT;
1194                 break;
1195               case 'S':
1196                 negate = true;
1197               case 's':
1198                 posixID = RETokenPOSIX.SPACE;
1199                 break;
1200               case 'W':
1201                 negate = true;
1202               case 'w':
1203                 posixID = RETokenPOSIX.ALNUM;
1204                 break;
1205               }
1206             }
1207             if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
1208               np = getNamedProperty(pattern, index - 1, pLength);
1209               if (np == null)
1210                 throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1211               index = index - 1 + np.len - 1;
1212             }
1213             else {
1214               CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
1215               if (ce == null)
1216                 throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1217               asciiEsc = ce.ch; asciiEscIsSet = true;
1218               index = index - 1 + ce.len - 1;
1219             }
1220             if (lastCharIsSet) {
1221               RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
1222               if (insensUSASCII) t.unicodeAware = false;
1223               options.addElement(t);
1224             }
1225
1226             if (posixID != -1) {
1227               RETokenPOSIX t = new RETokenPOSIX(subIndex,posixID,insens,negate);
1228               if (insensUSASCII) t.unicodeAware = false;
1229               options.addElement(t);
1230             } else if (np != null) {
1231               RETokenNamedProperty t = getRETokenNamedProperty(subIndex,np,insens,index);
1232               if (insensUSASCII) t.unicodeAware = false;
1233               options.addElement(t);
1234             } else if (asciiEscIsSet) {
1235               lastChar = asciiEsc; lastCharIsSet = true;
1236             } else {
1237               lastChar = pattern[index]; lastCharIsSet = true;
1238             }
1239             ++index;
1240           } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
1241             StringBuffer posixSet = new StringBuffer();
1242             index = getPosixSet(pattern,index+1,posixSet);
1243             int posixId = RETokenPOSIX.intValue(posixSet.toString());
1244             if (posixId != -1) {
1245               RETokenPOSIX t = new RETokenPOSIX(subIndex,posixId,insens,false);
1246               if (insensUSASCII) t.unicodeAware = false;
1247               options.addElement(t);
1248             }
1249           } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
1250                 ParseCharClassResult result = parseCharClass(
1251                     subIndex, pattern, index, pLength, cflags, syntax, 0);
1252                 addition.addElement(result.token);
1253                 addition.addElement("|");
1254                 index = result.index;
1255           } else if ((ch == '&') &&
1256                      (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
1257                      (index < pLength) && (pattern[index] == '&')) {
1258                 if (returnAtAndOperator) {
1259                     ParseCharClassResult result = new ParseCharClassResult();
1260                     options.trimToSize();
1261                     if (additionAndAppeared) addition.addElement("&");
1262                     if (addition.size() == 0) addition = null;
1263                     result.token = new RETokenOneOf(subIndex,
1264                         options, addition, negative);
1265                     result.index = index - 1;
1266                     result.returnAtAndOperator = true;
1267                     return result;
1268                 }
1269                 // The precedence of the operator "&&" is the lowest.
1270                 // So we postpone adding "&" until other elements
1271                 // are added. And we insert Boolean.FALSE at the
1272                 // beginning of the list of tokens following "&&".
1273                 // So, "&&[a-b][k-m]" will be stored in the Vecter
1274                 // addition in this order:
1275                 //     Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1276                 if (additionAndAppeared) addition.addElement("&");
1277                 addition.addElement(Boolean.FALSE);
1278                 additionAndAppeared = true;
1279
1280                 // The part on which "&&" operates may be either
1281                 //   (1) explicitly enclosed by []
1282                 //   or
1283                 //   (2) not enclosed by [] and terminated by the
1284                 //       next "&&" or the end of the character list.
1285                 //  Let the preceding else if block do the case (1).
1286                 //  We must do something in case of (2).
1287                 if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
1288                     ParseCharClassResult result = parseCharClass(
1289                         subIndex, pattern, index+1, pLength, cflags, syntax,
1290                         RETURN_AT_AND);
1291                     addition.addElement(result.token);
1292                     addition.addElement("|");
1293                     // If the method returned at the next "&&", it is OK.
1294                     // Otherwise we have eaten the mark of the end of this
1295                     // character list "]".  In this case we must give back
1296                     // the end mark.
1297                     index = (result.returnAtAndOperator ?
1298                         result.index: result.index - 1);
1299                 }
1300           } else {
1301             if (lastCharIsSet) {
1302               RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
1303               if (insensUSASCII) t.unicodeAware = false;
1304               options.addElement(t);
1305             }
1306             lastChar = ch; lastCharIsSet = true;
1307           }
1308           if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1309         } // while in list
1310         // Out of list, index is one past ']'
1311
1312         if (lastCharIsSet) {
1313           RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
1314           if (insensUSASCII) t.unicodeAware = false;
1315           options.addElement(t);
1316         }
1317
1318         ParseCharClassResult result = new ParseCharClassResult();
1319         // Create a new RETokenOneOf
1320         options.trimToSize();
1321         if (additionAndAppeared) addition.addElement("&");
1322         if (addition.size() == 0) addition = null;
1323         result.token = new RETokenOneOf(subIndex,options, addition, negative);
1324         result.index = index;
1325         return result;
1326   }
1327
1328   private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
1329     unit.ch = input[index++];
1330     unit.bk = (unit.ch == '\\'
1331                && (!quot || index >= input.length || input[index] == 'E'));
1332     if (unit.bk)
1333       if (index < input.length)
1334         unit.ch = input[index++];
1335       else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
1336     return index;
1337   }
1338
1339   private static int parseInt(char[] input, int pos, int len, int radix) {
1340     int ret = 0;
1341     for (int i = pos; i < pos + len; i++) {
1342         ret = ret * radix + Character.digit(input[i], radix);
1343     }
1344     return ret;
1345   }
1346
1347   /**
1348    * This class represents various expressions for a character.
1349    * "a"      : 'a' itself.
1350    * "\0123"  : Octal char 0123
1351    * "\x1b"   : Hex char 0x1b
1352    * "\u1234" : Unicode char \u1234
1353    */
1354   private static class CharExpression {
1355     /** character represented by this expression */
1356     char ch;
1357     /** String expression */
1358     String expr;
1359     /** length of this expression */
1360     int len;
1361     public String toString() { return expr; }
1362   }
1363
1364   private static CharExpression getCharExpression(char[] input, int pos, int lim,
1365         RESyntax syntax) {
1366     CharExpression ce = new CharExpression();
1367     char c = input[pos];
1368     if (c == '\\') {
1369       if (pos + 1 >= lim) return null;
1370       c = input[pos + 1];
1371       switch(c) {
1372       case 't':
1373         ce.ch = '\t';
1374         ce.len = 2;
1375         break;
1376       case 'n':
1377         ce.ch = '\n';
1378         ce.len = 2;
1379         break;
1380       case 'r':
1381         ce.ch = '\r';
1382         ce.len = 2;
1383         break;
1384       case 'x':
1385       case 'u':
1386         if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1387             (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1388           int l = 0;
1389           int expectedLength = (c == 'x' ? 2 : 4);
1390           for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
1391             if (i >= lim) break;
1392             if (!((input[i] >= '0' && input[i] <= '9') ||
1393                   (input[i] >= 'A' && input[i] <= 'F') ||
1394                   (input[i] >= 'a' && input[i] <= 'f')))
1395                 break;
1396             l++;
1397           }
1398           if (l != expectedLength) return null;
1399           ce.ch = (char)(parseInt(input, pos + 2, l, 16));
1400           ce.len = l + 2;
1401         }
1402         else {
1403           ce.ch = c;
1404           ce.len = 2;
1405         }
1406         break;
1407       case '0':
1408         if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
1409           int l = 0;
1410           for (int i = pos + 2; i < pos + 2 + 3; i++) {
1411             if (i >= lim) break;
1412             if (input[i] < '0' || input[i] > '7') break;
1413             l++;
1414           }
1415           if (l == 3 && input[pos + 2] > '3') l--;
1416           if (l <= 0) return null;
1417           ce.ch = (char)(parseInt(input, pos + 2, l, 8));
1418           ce.len = l + 2;
1419         }
1420         else {
1421           ce.ch = c;
1422           ce.len = 2;
1423         }
1424         break;
1425       default:
1426         ce.ch = c;
1427         ce.len = 2;
1428         break;
1429       }
1430     }
1431     else {
1432       ce.ch = input[pos];
1433       ce.len = 1;
1434     }
1435     ce.expr = new String(input, pos, ce.len);
1436     return ce;
1437   }
1438
1439   /**
1440    * This class represents a substring in a pattern string expressing
1441    * a named property.
1442    * "\pA"      : Property named "A"
1443    * "\p{prop}" : Property named "prop"
1444    * "\PA"      : Property named "A" (Negated)
1445    * "\P{prop}" : Property named "prop" (Negated)
1446    */
1447   private static class NamedProperty {
1448     /** Property name */
1449     String name;
1450     /** Negated or not */
1451     boolean negate;
1452     /** length of this expression */
1453     int len;
1454   }
1455
1456   private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
1457     NamedProperty np = new NamedProperty();
1458     char c = input[pos];
1459     if (c == '\\') {
1460       if (++pos >= lim) return null;
1461       c = input[pos++];
1462       switch(c) {
1463       case 'p':
1464         np.negate = false;
1465         break;
1466       case 'P':
1467         np.negate = true;
1468         break;
1469       default:
1470         return null;
1471       }
1472       c = input[pos++];
1473       if (c == '{') {
1474           int p = -1;
1475           for (int i = pos; i < lim; i++) {
1476               if (input[i] == '}') {
1477                   p = i;
1478                   break;
1479               }
1480           }
1481           if (p < 0) return null;
1482           int len = p - pos;
1483           np.name = new String(input, pos, len);
1484           np.len = len + 4;
1485       }
1486       else {
1487           np.name = new String(input, pos - 1, 1);
1488           np.len = 3;
1489       }
1490       return np;
1491     }
1492     else return null;
1493   }
1494
1495   private static RETokenNamedProperty getRETokenNamedProperty(
1496       int subIndex, NamedProperty np, boolean insens, int index)
1497       throws REException {
1498     try {
1499         return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
1500     }
1501     catch (REException e) {
1502         REException ree;
1503         ree = new REException(e.getMessage(), REException.REG_ESCAPE, index);
1504         ree.initCause(e);
1505         throw ree;
1506     }
1507   }
1508
1509   /**
1510    * Checks if the regular expression matches the input in its entirety.
1511    *
1512    * @param input The input text.
1513    */
1514   public boolean isMatch(Object input) {
1515     return isMatch(input,0,0);
1516   }
1517
1518   /**
1519    * Checks if the input string, starting from index, is an exact match of
1520    * this regular expression.
1521    *
1522    * @param input The input text.
1523    * @param index The offset index at which the search should be begin.
1524    */
1525   public boolean isMatch(Object input,int index) {
1526     return isMatch(input,index,0);
1527   }
1528
1529
1530   /**
1531    * Checks if the input, starting from index and using the specified
1532    * execution flags, is an exact match of this regular expression.
1533    *
1534    * @param input The input text.
1535    * @param index The offset index at which the search should be begin.
1536    * @param eflags The logical OR of any execution flags above.
1537    */
1538   public boolean isMatch(Object input,int index,int eflags) {
1539     return isMatchImpl(makeCharIndexed(input,index),index,eflags);
1540   }
1541
1542   private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
1543     if (firstToken == null)  // Trivial case
1544       return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
1545     REMatch m = new REMatch(numSubs, index, eflags);
1546     if (firstToken.match(input, m)) {
1547         if (m != null) {
1548             if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
1549                 return true;
1550             }
1551         }
1552     }
1553     return false;
1554   }
1555
1556   /**
1557    * Returns the maximum number of subexpressions in this regular expression.
1558    * If the expression contains branches, the value returned will be the
1559    * maximum subexpressions in any of the branches.
1560    */
1561   public int getNumSubs() {
1562     return numSubs;
1563   }
1564
1565   // Overrides REToken.setUncle
1566   void setUncle(REToken uncle) {
1567       if (lastToken != null) {
1568           lastToken.setUncle(uncle);
1569       } else super.setUncle(uncle); // to deal with empty subexpressions
1570   }
1571
1572   // Overrides REToken.chain
1573
1574   boolean chain(REToken next) {
1575     super.chain(next);
1576     setUncle(next);
1577     return true;
1578   }
1579
1580   /**
1581    * Returns the minimum number of characters that could possibly
1582    * constitute a match of this regular expression.
1583    */
1584   public int getMinimumLength() {
1585       return minimumLength;
1586   }
1587
1588   public int getMaximumLength() {
1589       return maximumLength;
1590   }
1591
1592   /**
1593    * Returns an array of all matches found in the input.
1594    *
1595    * If the regular expression allows the empty string to match, it will
1596    * substitute matches at all positions except the end of the input.
1597    *
1598    * @param input The input text.
1599    * @return a non-null (but possibly zero-length) array of matches
1600    */
1601   public REMatch[] getAllMatches(Object input) {
1602     return getAllMatches(input,0,0);
1603   }
1604
1605   /**
1606    * Returns an array of all matches found in the input,
1607    * beginning at the specified index position.
1608    *
1609    * If the regular expression allows the empty string to match, it will
1610    * substitute matches at all positions except the end of the input.
1611    *
1612    * @param input The input text.
1613    * @param index The offset index at which the search should be begin.
1614    * @return a non-null (but possibly zero-length) array of matches
1615    */
1616   public REMatch[] getAllMatches(Object input, int index) {
1617     return getAllMatches(input,index,0);
1618   }
1619
1620   /**
1621    * Returns an array of all matches found in the input string,
1622    * beginning at the specified index position and using the specified
1623    * execution flags.
1624    *
1625    * If the regular expression allows the empty string to match, it will
1626    * substitute matches at all positions except the end of the input.
1627    *
1628    * @param input The input text.
1629    * @param index The offset index at which the search should be begin.
1630    * @param eflags The logical OR of any execution flags above.
1631    * @return a non-null (but possibly zero-length) array of matches
1632    */
1633   public REMatch[] getAllMatches(Object input, int index, int eflags) {
1634     return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
1635   }
1636
1637   // this has been changed since 1.03 to be non-overlapping matches
1638   private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
1639     Vector all = new Vector();
1640     REMatch m = null;
1641     while ((m = getMatchImpl(input,index,eflags,null)) != null) {
1642       all.addElement(m);
1643       index = m.getEndIndex();
1644       if (m.end[0] == 0) {   // handle pathological case of zero-length match
1645         index++;
1646         input.move(1);
1647       } else {
1648         input.move(m.end[0]);
1649       }
1650       if (!input.isValid()) break;
1651     }
1652     REMatch[] mset = new REMatch[all.size()];
1653     all.copyInto(mset);
1654     return mset;
1655   }
1656
1657     /* Implements abstract method REToken.match() */
1658     boolean match(CharIndexed input, REMatch mymatch) {
1659         input.setHitEnd(mymatch);
1660         if (firstToken == null) {
1661             return next(input, mymatch);
1662         }
1663
1664         // Note the start of this subexpression
1665         mymatch.start1[subIndex] = mymatch.index;
1666
1667         return firstToken.match(input, mymatch);
1668     }
1669
1670     REMatch findMatch(CharIndexed input, REMatch mymatch) {
1671         if (mymatch.backtrackStack == null)
1672           mymatch.backtrackStack = new BacktrackStack();
1673         boolean b = match(input, mymatch);
1674         if (b) {
1675             return mymatch;
1676         }
1677         return null;
1678     }
1679
1680   /**
1681    * Returns the first match found in the input.  If no match is found,
1682    * null is returned.
1683    *
1684    * @param input The input text.
1685    * @return An REMatch instance referencing the match, or null if none.
1686    */
1687   public REMatch getMatch(Object input) {
1688     return getMatch(input,0,0);
1689   }
1690
1691   /**
1692    * Returns the first match found in the input, beginning
1693    * the search at the specified index.  If no match is found,
1694    * returns null.
1695    *
1696    * @param input The input text.
1697    * @param index The offset within the text to begin looking for a match.
1698    * @return An REMatch instance referencing the match, or null if none.
1699    */
1700   public REMatch getMatch(Object input, int index) {
1701     return getMatch(input,index,0);
1702   }
1703
1704   /**
1705    * Returns the first match found in the input, beginning
1706    * the search at the specified index, and using the specified
1707    * execution flags.  If no match is found, returns null.
1708    *
1709    * @param input The input text.
1710    * @param index The offset index at which the search should be begin.
1711    * @param eflags The logical OR of any execution flags above.
1712    * @return An REMatch instance referencing the match, or null if none.
1713    */
1714   public REMatch getMatch(Object input, int index, int eflags) {
1715     return getMatch(input,index,eflags,null);
1716   }
1717
1718   /**
1719    * Returns the first match found in the input, beginning the search
1720    * at the specified index, and using the specified execution flags.
1721    * If no match is found, returns null.  If a StringBuffer is
1722    * provided and is non-null, the contents of the input text from the
1723    * index to the beginning of the match (or to the end of the input,
1724    * if there is no match) are appended to the StringBuffer.
1725    *
1726    * @param input The input text.
1727    * @param index The offset index at which the search should be begin.
1728    * @param eflags The logical OR of any execution flags above.
1729    * @param buffer The StringBuffer to save pre-match text in.
1730    * @return An REMatch instance referencing the match, or null if none.  */
1731   public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1732     return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1733   }
1734
1735   REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1736       boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
1737       boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
1738       RE re = (tryEntireMatch ? (RE) this.clone() : this);
1739       if (tryEntireMatch) {
1740           RETokenEnd reEnd = new RETokenEnd(0, null);
1741           reEnd.setFake(true);
1742           re.chain(reEnd);
1743       }
1744       // Create a new REMatch to hold results
1745       REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1746       do {
1747           /* The following potimization is commented out because
1748              the matching should be tried even if the length of
1749              input is obviously too short in order that
1750              java.util.regex.Matcher#hitEnd() may work correctly.
1751           // Optimization: check if anchor + minimumLength > length
1752           if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1753           */
1754               if (re.match(input, mymatch)) {
1755                   REMatch best = mymatch;
1756                   // We assume that the match that coms first is the best.
1757                   // And the following "The longer, the better" rule has
1758                   // been commented out. The longest is not neccesarily
1759                   // the best. For example, "a" out of "aaa" is the best
1760                   // match for /a+?/.
1761                   /*
1762                   // Find best match of them all to observe leftmost longest
1763                   while ((mymatch = mymatch.next) != null) {
1764                       if (mymatch.index > best.index) {
1765                         best = mymatch;
1766                       }
1767                   }
1768                   */
1769                   best.end[0] = best.index;
1770                   best.finish(input);
1771                   input.setLastMatch(best);
1772                   return best;
1773               }
1774           /* End of the optimization commented out
1775           }
1776           */
1777           mymatch.clear(++anchor);
1778           // Append character to buffer if needed
1779           if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1780               buffer.append(input.charAt(0));
1781           }
1782       // java.util.regex.Matcher#hitEnd() requires that the search should
1783       // be tried at the end of input, so we use move1(1) instead of move(1)
1784       } while (doMove && input.move1(1));
1785
1786       // Special handling at end of input for e.g. "$"
1787       if (minimumLength == 0) {
1788           if (match(input, mymatch)) {
1789               mymatch.finish(input);
1790               return mymatch;
1791           }
1792       }
1793
1794       return null;
1795   }
1796
1797   /**
1798    * Returns an REMatchEnumeration that can be used to iterate over the
1799    * matches found in the input text.
1800    *
1801    * @param input The input text.
1802    * @return A non-null REMatchEnumeration instance.
1803    */
1804   public REMatchEnumeration getMatchEnumeration(Object input) {
1805     return getMatchEnumeration(input,0,0);
1806   }
1807
1808
1809   /**
1810    * Returns an REMatchEnumeration that can be used to iterate over the
1811    * matches found in the input text.
1812    *
1813    * @param input The input text.
1814    * @param index The offset index at which the search should be begin.
1815    * @return A non-null REMatchEnumeration instance, with its input cursor
1816    *  set to the index position specified.
1817    */
1818   public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1819     return getMatchEnumeration(input,index,0);
1820   }
1821
1822   /**
1823    * Returns an REMatchEnumeration that can be used to iterate over the
1824    * matches found in the input text.
1825    *
1826    * @param input The input text.
1827    * @param index The offset index at which the search should be begin.
1828    * @param eflags The logical OR of any execution flags above.
1829    * @return A non-null REMatchEnumeration instance, with its input cursor
1830    *  set to the index position specified.
1831    */
1832   public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1833     return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1834   }
1835
1836
1837   /**
1838    * Substitutes the replacement text for the first match found in the input.
1839    *
1840    * @param input The input text.
1841    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1842    * @return A String interpolating the substituted text.
1843    * @see REMatch#substituteInto
1844    */
1845   public String substitute(Object input,String replace) {
1846     return substitute(input,replace,0,0);
1847   }
1848
1849   /**
1850    * Substitutes the replacement text for the first match found in the input
1851    * beginning at the specified index position.  Specifying an index
1852    * effectively causes the regular expression engine to throw away the
1853    * specified number of characters.
1854    *
1855    * @param input The input text.
1856    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1857    * @param index The offset index at which the search should be begin.
1858    * @return A String containing the substring of the input, starting
1859    *   at the index position, and interpolating the substituted text.
1860    * @see REMatch#substituteInto
1861    */
1862   public String substitute(Object input,String replace,int index) {
1863     return substitute(input,replace,index,0);
1864   }
1865
1866   /**
1867    * Substitutes the replacement text for the first match found in the input
1868    * string, beginning at the specified index position and using the
1869    * specified execution flags.
1870    *
1871    * @param input The input text.
1872    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1873    * @param index The offset index at which the search should be begin.
1874    * @param eflags The logical OR of any execution flags above.
1875    * @return A String containing the substring of the input, starting
1876    *   at the index position, and interpolating the substituted text.
1877    * @see REMatch#substituteInto
1878    */
1879   public String substitute(Object input,String replace,int index,int eflags) {
1880     return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1881   }
1882
1883   private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1884     StringBuffer buffer = new StringBuffer();
1885     REMatch m = getMatchImpl(input,index,eflags,buffer);
1886     if (m==null) return buffer.toString();
1887     buffer.append(getReplacement(replace, m, eflags));
1888     if (input.move(m.end[0])) {
1889       do {
1890         buffer.append(input.charAt(0));
1891       } while (input.move(1));
1892     }
1893     return buffer.toString();
1894   }
1895
1896   /**
1897    * Substitutes the replacement text for each non-overlapping match found
1898    * in the input text.
1899    *
1900    * @param input The input text.
1901    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1902    * @return A String interpolating the substituted text.
1903    * @see REMatch#substituteInto
1904    */
1905   public String substituteAll(Object input,String replace) {
1906     return substituteAll(input,replace,0,0);
1907   }
1908
1909   /**
1910    * Substitutes the replacement text for each non-overlapping match found
1911    * in the input text, starting at the specified index.
1912    *
1913    * If the regular expression allows the empty string to match, it will
1914    * substitute matches at all positions except the end of the input.
1915    *
1916    * @param input The input text.
1917    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1918    * @param index The offset index at which the search should be begin.
1919    * @return A String containing the substring of the input, starting
1920    *   at the index position, and interpolating the substituted text.
1921    * @see REMatch#substituteInto
1922    */
1923   public String substituteAll(Object input,String replace,int index) {
1924     return substituteAll(input,replace,index,0);
1925   }
1926
1927   /**
1928    * Substitutes the replacement text for each non-overlapping match found
1929    * in the input text, starting at the specified index and using the
1930    * specified execution flags.
1931    *
1932    * @param input The input text.
1933    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1934    * @param index The offset index at which the search should be begin.
1935    * @param eflags The logical OR of any execution flags above.
1936    * @return A String containing the substring of the input, starting
1937    *   at the index position, and interpolating the substituted text.
1938    * @see REMatch#substituteInto
1939    */
1940   public String substituteAll(Object input,String replace,int index,int eflags) {
1941     return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1942   }
1943
1944   private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1945     StringBuffer buffer = new StringBuffer();
1946     REMatch m;
1947     while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1948       buffer.append(getReplacement(replace, m, eflags));
1949       index = m.getEndIndex();
1950       if (m.end[0] == 0) {
1951         char ch = input.charAt(0);
1952         if (ch != CharIndexed.OUT_OF_BOUNDS)
1953             buffer.append(ch);
1954         input.move(1);
1955       } else {
1956           input.move(m.end[0]);
1957       }
1958
1959       if (!input.isValid()) break;
1960     }
1961     return buffer.toString();
1962   }
1963
1964   public static String getReplacement(String replace, REMatch m, int eflags) {
1965     if ((eflags & REG_NO_INTERPOLATE) > 0)
1966       return replace;
1967     else {
1968       if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) {
1969         StringBuffer sb = new StringBuffer();
1970         int l = replace.length();
1971         for (int i = 0; i < l; i++) {
1972             char c = replace.charAt(i);
1973             switch(c) {
1974             case '\\':
1975               i++;
1976               // Let StringIndexOutOfBoundsException be thrown.
1977               sb.append(replace.charAt(i));
1978               break;
1979             case '$':
1980               int i1 = i + 1;
1981               while (i1 < replace.length() &&
1982                 Character.isDigit(replace.charAt(i1))) i1++;
1983               sb.append(m.substituteInto(replace.substring(i, i1)));
1984               i = i1 - 1;
1985               break;
1986             default:
1987               sb.append(c);
1988             }
1989         }
1990         return sb.toString();
1991       }
1992       else
1993         return m.substituteInto(replace);
1994     }
1995   }
1996
1997   /* Helper function for constructor */
1998   private void addToken(REToken next) {
1999     if (next == null) return;
2000     minimumLength += next.getMinimumLength();
2001     int nmax = next.getMaximumLength();
2002     if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2003         maximumLength += nmax;
2004     else
2005         maximumLength = Integer.MAX_VALUE;
2006
2007     if (firstToken == null) {
2008         lastToken = firstToken = next;
2009     } else {
2010       // if chain returns false, it "rejected" the token due to
2011       // an optimization, and next was combined with lastToken
2012       if (lastToken.chain(next)) {
2013           lastToken = next;
2014       }
2015     }
2016   }
2017
2018   private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
2019     if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
2020     return new RETokenRepeated(current.subIndex,current,min,max);
2021   }
2022
2023   private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
2024     // Precondition: pattern[index-1] == ':'
2025     // we will return pos of closing ']'.
2026     int i;
2027     for (i=index; i<(pattern.length-1); i++) {
2028       if ((pattern[i] == ':') && (pattern[i+1] == ']'))
2029         return i+2;
2030       buf.append(pattern[i]);
2031     }
2032     return index; // didn't match up
2033   }
2034
2035   private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
2036     // Precondition: input[index-1] == '{', minMax != null
2037
2038     boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
2039     int startIndex = index;
2040     if (index == input.length) {
2041       if (mustMatch)
2042         throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
2043       else
2044         return startIndex;
2045     }
2046
2047     int min,max=0;
2048     CharUnit unit = new CharUnit();
2049     StringBuffer buf = new StringBuffer();
2050
2051     // Read string of digits
2052     do {
2053       index = getCharUnit(input,index,unit,false);
2054       if (Character.isDigit(unit.ch))
2055         buf.append(unit.ch);
2056     } while ((index != input.length) && Character.isDigit(unit.ch));
2057
2058     // Check for {} tomfoolery
2059     if (buf.length() == 0) {
2060       if (mustMatch)
2061         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2062       else
2063         return startIndex;
2064     }
2065
2066     min = Integer.parseInt(buf.toString());
2067
2068     if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2069       max = min;
2070     else if (index == input.length)
2071       if (mustMatch)
2072         throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
2073       else
2074         return startIndex;
2075     else if ((unit.ch == ',') && !unit.bk) {
2076       buf = new StringBuffer();
2077       // Read string of digits
2078       while (((index = getCharUnit(input,index,unit,false)) != input.length) && Character.isDigit(unit.ch))
2079         buf.append(unit.ch);
2080
2081       if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2082         if (mustMatch)
2083           throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2084         else
2085           return startIndex;
2086
2087       // This is the case of {x,}
2088       if (buf.length() == 0) max = Integer.MAX_VALUE;
2089       else max = Integer.parseInt(buf.toString());
2090     } else
2091       if (mustMatch)
2092         throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2093       else
2094         return startIndex;
2095
2096     // We know min and max now, and they are valid.
2097
2098     minMax.first = min;
2099     minMax.second = max;
2100
2101     // return the index following the '}'
2102     return index;
2103   }
2104
2105    /**
2106     * Return a human readable form of the compiled regular expression,
2107     * useful for debugging.
2108     */
2109    public String toString() {
2110      StringBuffer sb = new StringBuffer();
2111      dump(sb);
2112      return sb.toString();
2113    }
2114
2115   void dump(StringBuffer os) {
2116     os.append("(?#startRE subIndex=" + subIndex + ")");
2117     if (subIndex == 0)
2118       os.append("?:");
2119     if (firstToken != null)
2120       firstToken.dumpAll(os);
2121     if (subIndex == 0)
2122       os.append(")");
2123     os.append("(?#endRE subIndex=" + subIndex + ")");
2124   }
2125
2126   // Cast input appropriately or throw exception
2127   // This method was originally a private method, but has been made
2128   // public because java.util.regex.Matcher uses this.
2129   public static CharIndexed makeCharIndexed(Object input, int index) {
2130       // The case where input is already a CharIndexed is supposed
2131       // be the most likely because this is the case with
2132       // java.util.regex.Matcher.
2133       // We could let a String or a CharSequence fall through
2134       // to final input, but since it'a very likely input type,
2135       // we check it first.
2136     if (input instanceof CharIndexed) {
2137         CharIndexed ci = (CharIndexed) input;
2138         ci.setAnchor(index);
2139         return ci;
2140     }
2141     else if (input instanceof CharSequence)
2142       return new CharIndexedCharSequence((CharSequence) input,index);
2143     else if (input instanceof String)
2144       return new CharIndexedString((String) input,index);
2145     else if (input instanceof char[])
2146       return new CharIndexedCharArray((char[]) input,index);
2147     else if (input instanceof StringBuffer)
2148       return new CharIndexedStringBuffer((StringBuffer) input,index);
2149     else if (input instanceof InputStream)
2150       return new CharIndexedInputStream((InputStream) input,index);
2151     else
2152         return new CharIndexedString(input.toString(), index);
2153   }
2154 }