2 Copyright (C) 2006 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
38 package gnu.java.util.regex;
39 import java.io.InputStream;
40 import java.io.Serializable;
41 import java.util.Locale;
42 import java.util.PropertyResourceBundle;
43 import java.util.ResourceBundle;
44 import java.util.Stack;
45 import java.util.Vector;
48 * RE provides the user interface for compiling and matching regular
51 * A regular expression object (class RE) is compiled by constructing it
52 * from a String, StringBuffer or character array, with optional
53 * compilation flags (below)
54 * and an optional syntax specification (see RESyntax; if not specified,
55 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
57 * Once compiled, a regular expression object is reusable as well as
58 * threadsafe: multiple threads can use the RE instance simultaneously
59 * to match against different input text.
61 * Various methods attempt to match input text against a compiled
62 * regular expression. These methods are:
63 * <LI><code>isMatch</code>: returns true if the input text in its
64 * entirety matches the regular expression pattern.
65 * <LI><code>getMatch</code>: returns the first match found in the
66 * input text, or null if no match is found.
67 * <LI><code>getAllMatches</code>: returns an array of all
68 * non-overlapping matches found in the input text. If no matches are
69 * found, the array is zero-length.
70 * <LI><code>substitute</code>: substitute the first occurence of the
71 * pattern in the input text with a replacement string (which may
72 * include metacharacters $0-$9, see REMatch.substituteInto).
73 * <LI><code>substituteAll</code>: same as above, but repeat for each
74 * match before returning.
75 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration
76 * object that allows iteration over the matches (see
77 * REMatchEnumeration for some reasons why you may want to do this
78 * instead of using <code>getAllMatches</code>.
81 * These methods all have similar argument lists. The input can be a
82 * CharIndexed, String, a character array, a StringBuffer, or an
83 * InputStream of some sort. Note that when using an
84 * InputStream, the stream read position cannot be guaranteed after
85 * attempting a match (this is not a bug, but a consequence of the way
86 * regular expressions work). Using an REMatchEnumeration can
87 * eliminate most positioning problems.
89 * Although the input object can be of various types, it is recommended
90 * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()}
91 * can show the last match found on this input, which helps the expression
92 * \G work as the end of the previous match.
96 * The optional index argument specifies the offset from the beginning
97 * of the text at which the search should start (see the descriptions
98 * of some of the execution flags for how this can affect positional
99 * pattern operators). For an InputStream, this means an
100 * offset from the current read position, so subsequent calls with the
101 * same index argument on an InputStream will not
102 * necessarily access the same position on the stream, whereas
103 * repeated searches at a given index in a fixed string will return
104 * consistent results.
107 * You can optionally affect the execution environment by using a
108 * combination of execution flags (constants listed below).
111 * All operations on a regular expression are performed in a
112 * thread-safe manner.
114 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
115 * @version 1.1.5-dev, to be released
118 public class RE extends REToken {
120 private static final class IntPair implements Serializable {
121 public int first, second;
124 private static final class CharUnit implements Serializable {
129 // This String will be returned by getVersion()
130 private static final String VERSION = "1.1.5-dev";
132 // The localized strings are kept in a separate file
133 // Used by getLocalizedMessage().
134 private static ResourceBundle messages;
136 // Name of the bundle that contains the localized messages.
137 private static final String bundle = "gnu/java/util/regex/MessagesBundle";
139 // These are, respectively, the first and last tokens in our linked list
140 // If there is only one token, firstToken == lastToken
141 private REToken firstToken, lastToken;
143 // This is the number of subexpressions in this regular expression,
144 // with a minimum value of zero. Returned by getNumSubs()
147 /** Minimum length, in characters, of any possible match. */
148 private int minimumLength;
149 private int maximumLength;
152 * Compilation flag. Do not differentiate case. Subsequent
153 * searches using this RE will be case insensitive.
155 public static final int REG_ICASE = 0x02;
158 * Compilation flag. The match-any-character operator (dot)
159 * will match a newline character. When set this overrides the syntax
160 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
161 * the "/s" operator in Perl.
163 public static final int REG_DOT_NEWLINE = 0x04;
166 * Compilation flag. Use multiline mode. In this mode, the ^ and $
167 * anchors will match based on newlines within the input. This is
168 * equivalent to the "/m" operator in Perl.
170 public static final int REG_MULTILINE = 0x08;
174 * The match-beginning operator (^) will not match at the beginning
175 * of the input string. Useful for matching on a substring when you
176 * know the context of the input is such that position zero of the
177 * input to the match test is not actually position zero of the text.
179 * This example demonstrates the results of various ways of matching on
183 * String s = "food bar fool";<BR>
184 * RE exp = new RE("^foo.");<BR>
185 * REMatch m0 = exp.getMatch(s);<BR>
186 * REMatch m1 = exp.getMatch(s.substring(8));<BR>
187 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
188 * REMatch m3 = exp.getMatch(s,8); <BR>
189 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
192 * // m0.toString(): "food"<BR>
193 * // m1.toString(): "fool"<BR>
194 * // m2.toString(): null<BR>
195 * // m3.toString(): null<BR>
196 * // m4.toString(): "fool"<BR>
199 public static final int REG_NOTBOL = 0x10;
203 * The match-end operator ($) does not match at the end
204 * of the input string. Useful for matching on substrings.
206 public static final int REG_NOTEOL = 0x20;
210 * When a match method is invoked that starts matching at a non-zero
211 * index into the input, treat the input as if it begins at the index
212 * given. The effect of this flag is that the engine does not "see"
213 * any text in the input before the given index. This is useful so
214 * that the match-beginning operator (^) matches not at position 0
215 * in the input string, but at the position the search started at
216 * (based on the index input given to the getMatch function). See
217 * the example under REG_NOTBOL. It also affects the use of the \<
220 public static final int REG_ANCHORINDEX = 0x40;
224 * The substitute and substituteAll methods will not attempt to
225 * interpolate occurrences of $1-$9 in the replacement text with
226 * the corresponding subexpressions. For example, you may want to
227 * replace all matches of "one dollar" with "$1".
229 public static final int REG_NO_INTERPOLATE = 0x80;
233 * Try to match the whole input string. An implicit match-end operator
234 * is added to this regexp.
236 public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
240 * The substitute and substituteAll methods will treat the
241 * character '\' in the replacement as an escape to a literal
242 * character. In this case "\n", "\$", "\\", "\x40" and "\012"
243 * will become "n", "$", "\", "x40" and "012" respectively.
244 * This flag has no effect if REG_NO_INTERPOLATE is set on.
246 public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
249 * Compilation flag. Allow whitespace and comments in pattern.
250 * This is equivalent to the "/x" operator in Perl.
252 public static final int REG_X_COMMENTS = 0x0400;
255 * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
257 public static final int REG_ICASE_USASCII = 0x0800;
261 * Do not move the position at which the search begins. If not set,
262 * the starting position will be moved until a match is found.
264 public static final int REG_FIX_STARTING_POSITION = 0x1000;
266 /** Returns a string representing the version of the gnu.regexp package. */
267 public static final String version() {
271 // Retrieves a message from the ResourceBundle
272 static final String getLocalizedMessage(String key) {
273 if (messages == null)
274 messages = PropertyResourceBundle.getBundle(bundle, Locale.getDefault());
275 return messages.getString(key);
279 * Constructs a regular expression pattern buffer without any compilation
280 * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
282 * @param pattern A regular expression pattern, in the form of a String,
283 * StringBuffer or char[]. Other input types will be converted to
284 * strings using the toString() method.
285 * @exception REException The input pattern could not be parsed.
286 * @exception NullPointerException The pattern was null.
288 public RE(Object pattern) throws REException {
289 this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
293 * Constructs a regular expression pattern buffer using the specified
294 * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
296 * @param pattern A regular expression pattern, in the form of a String,
297 * StringBuffer, or char[]. Other input types will be converted to
298 * strings using the toString() method.
299 * @param cflags The logical OR of any combination of the compilation flags listed above.
300 * @exception REException The input pattern could not be parsed.
301 * @exception NullPointerException The pattern was null.
303 public RE(Object pattern, int cflags) throws REException {
304 this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
308 * Constructs a regular expression pattern buffer using the specified
309 * compilation flags and regular expression syntax.
311 * @param pattern A regular expression pattern, in the form of a String,
312 * StringBuffer, or char[]. Other input types will be converted to
313 * strings using the toString() method.
314 * @param cflags The logical OR of any combination of the compilation flags listed above.
315 * @param syntax The type of regular expression syntax to use.
316 * @exception REException The input pattern could not be parsed.
317 * @exception NullPointerException The pattern was null.
319 public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
320 this(pattern,cflags,syntax,0,0);
323 // internal constructor used for alternation
324 private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
329 minimumLength = minLength;
330 maximumLength = maxLength;
331 addToken(new RETokenEndSub(subIndex));
334 private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
335 super(myIndex); // Subexpression index of this token.
336 initialize(patternObj, cflags, syntax, myIndex, nextSub);
339 // For use by subclasses
340 protected RE() { super(0); }
342 // The meat of construction
343 protected void initialize(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
345 if (patternObj instanceof String) {
346 pattern = ((String) patternObj).toCharArray();
347 } else if (patternObj instanceof char[]) {
348 pattern = (char[]) patternObj;
349 } else if (patternObj instanceof StringBuffer) {
350 pattern = new char [((StringBuffer) patternObj).length()];
351 ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
353 pattern = patternObj.toString().toCharArray();
356 int pLength = pattern.length;
358 numSubs = 0; // Number of subexpressions in this token.
359 Vector branches = null;
361 // linked list of tokens (sort of -- some closed loops can exist)
362 firstToken = lastToken = null;
364 // Precalculate these so we don't pay for the math every time we
365 // need to access them.
366 boolean insens = ((cflags & REG_ICASE) > 0);
367 boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
369 // Parse pattern into tokens. Does anyone know if it's more efficient
370 // to use char[] than a String.charAt()? I'm assuming so.
372 // index tracks the position in the char array
375 // this will be the current parse character (pattern[index])
376 CharUnit unit = new CharUnit();
378 // This is used for {x,y} calculations
379 IntPair minMax = new IntPair();
381 // Buffer a token so we can create a TokenRepeated, etc.
382 REToken currentToken = null;
384 boolean quot = false;
386 // Saved syntax and flags.
387 RESyntax savedSyntax = null;
389 boolean flagsSaved = false;
391 while (index < pLength) {
392 // read the next character unit (including backslash escapes)
393 index = getCharUnit(pattern,index,unit,quot);
396 if (unit.ch == 'Q') {
399 } else if (unit.ch == 'E') {
406 if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) {
407 if (Character.isWhitespace(unit.ch)) {
410 if (unit.ch == '#') {
411 for (int i = index; i < pLength; i++) {
412 if (pattern[i] == '\n') {
416 else if (pattern[i] == '\r') {
417 if (i + 1 < pLength && pattern[i + 1] == '\n') {
431 // ALTERNATION OPERATOR
432 // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
433 // not available if RE_LIMITED_OPS is set
435 // TODO: the '\n' literal here should be a test against REToken.newline,
436 // which unfortunately may be more than a single character.
437 if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ (unit.bk || quot)))
438 || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !(unit.bk || quot)) )
439 && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
440 // make everything up to here be a branch. create vector if nec.
441 addToken(currentToken);
442 RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
445 if (branches == null) {
446 branches = new Vector();
448 branches.addElement(theBranch);
449 firstToken = lastToken = currentToken = null;
452 // INTERVAL OPERATOR:
453 // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
454 // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
457 // what is proper interpretation of '{' at start of string?
459 // This method used to check "repeat.empty.token" to avoid such regexp
460 // as "(a*){2,}", but now "repeat.empty.token" is allowed.
462 else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ (unit.bk || quot))) {
463 int newIndex = getMinMax(pattern,index,minMax,syntax);
464 if (newIndex > index) {
465 if (minMax.first > minMax.second)
466 throw new REException(getLocalizedMessage("interval.order"),REException.REG_BADRPT,newIndex);
467 if (currentToken == null)
468 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,newIndex);
469 if (currentToken instanceof RETokenRepeated)
470 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,newIndex);
471 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
472 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,newIndex);
474 currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
477 addToken(currentToken);
478 currentToken = new RETokenChar(subIndex,unit.ch,insens);
479 if (insensUSASCII) currentToken.unicodeAware = false;
486 else if ((unit.ch == '[') && !(unit.bk || quot)) {
487 // Create a new RETokenOneOf
488 ParseCharClassResult result = parseCharClass(
489 subIndex, pattern, index, pLength, cflags, syntax, 0);
490 addToken(currentToken);
491 currentToken = result.token;
492 index = result.index;
496 // (...) | \(...\) depending on RE_NO_BK_PARENS
498 else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) {
499 boolean pure = false;
500 boolean comment = false;
501 boolean lookAhead = false;
502 boolean lookBehind = false;
503 boolean independent = false;
504 boolean negativelh = false;
505 boolean negativelb = false;
506 if ((index+1 < pLength) && (pattern[index] == '?')) {
507 switch (pattern[index+1]) {
509 if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
517 if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
524 // We assume that if the syntax supports look-ahead,
525 // it also supports look-behind.
526 if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
528 switch (pattern[index +1]) {
543 // We assume that if the syntax supports look-ahead,
544 // it also supports independent group.
545 if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
558 if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
559 // Set or reset syntax flags.
560 int flagIndex = index + 1;
562 RESyntax newSyntax = new RESyntax(syntax);
563 int newCflags = cflags;
564 boolean negate = false;
565 while (flagIndex < pLength && endFlag < 0) {
566 switch(pattern[flagIndex]) {
569 newCflags &= ~REG_ICASE;
571 newCflags |= REG_ICASE;
576 newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR);
578 newSyntax.setLineSeparator("\n");
583 newCflags &= ~REG_MULTILINE;
585 newCflags |= REG_MULTILINE;
590 newCflags &= ~REG_DOT_NEWLINE;
592 newCflags |= REG_DOT_NEWLINE;
597 newCflags |= REG_ICASE_USASCII;
599 newCflags &= ~REG_ICASE_USASCII;
604 newCflags &= ~REG_X_COMMENTS;
606 newCflags |= REG_X_COMMENTS;
615 endFlag = pattern[flagIndex];
618 throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
621 if (endFlag == ')') {
624 insens = ((cflags & REG_ICASE) > 0);
625 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
626 // This can be treated as though it were a comment.
628 index = flagIndex - 1;
631 if (endFlag == ':') {
632 savedSyntax = syntax;
633 savedCflags = cflags;
637 insens = ((cflags & REG_ICASE) > 0);
638 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
639 index = flagIndex -1;
640 // Fall through to the next case.
643 throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
646 if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
652 if (syntax.get(RESyntax.RE_COMMENTS)) {
657 throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
661 if (index >= pLength) {
662 throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
665 // find end of subexpression
666 int endIndex = index;
667 int nextIndex = index;
670 while ( ((nextIndex = getCharUnit(pattern,endIndex,unit,false)) > 0)
671 && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot))) ) {
672 if ((endIndex = nextIndex) >= pLength)
673 throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
674 else if ((unit.ch == '[') && !(unit.bk || quot)) {
675 // I hate to do something similar to the LIST OPERATOR matters
677 int listIndex = nextIndex;
678 if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
679 if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
680 int listEndIndex = -1;
682 while (listIndex < pLength && listEndIndex < 0) {
683 switch(pattern[listIndex++]) {
688 // Sun's API document says that regexp like "[a-d[m-p]]"
689 // is legal. Even something like "[[[^]]]]" is accepted.
691 if (listIndex < pLength && pattern[listIndex] == '^') listIndex++;
692 if (listIndex < pLength && pattern[listIndex] == ']') listIndex++;
696 listEndIndex = listIndex;
701 if (listEndIndex >= 0) {
702 nextIndex = listEndIndex;
703 if ((endIndex = nextIndex) >= pLength)
704 throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
708 throw new REException(getLocalizedMessage("subexpr.no.end"),REException.REG_ESUBREG,nextIndex);
710 else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
712 else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))
716 // endIndex is now position at a ')','\)'
717 // nextIndex is end of string or position after ')' or '\)'
719 if (comment) index = nextIndex;
720 else { // not a comment
721 // create RE subexpression as token.
722 addToken(currentToken);
727 int useIndex = (pure || lookAhead || lookBehind || independent) ?
728 0 : nextSub + numSubs;
729 currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
730 numSubs += ((RE) currentToken).getNumSubs();
733 currentToken = new RETokenLookAhead(currentToken,negativelh);
735 else if (lookBehind) {
736 currentToken = new RETokenLookBehind(currentToken,negativelb);
738 else if (independent) {
739 currentToken = new RETokenIndependent(currentToken);
744 syntax = savedSyntax;
745 cflags = savedCflags;
746 insens = ((cflags & REG_ICASE) > 0);
747 insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
753 // UNMATCHED RIGHT PAREN
754 // ) or \) throw exception if
755 // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
756 else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ (unit.bk || quot)))) {
757 throw new REException(getLocalizedMessage("unmatched.paren"),REException.REG_EPAREN,index);
760 // START OF LINE OPERATOR
763 else if ((unit.ch == '^') && !(unit.bk || quot)) {
764 addToken(currentToken);
766 RETokenStart token = null;
767 if ((cflags & REG_MULTILINE) > 0) {
768 String sep = syntax.getLineSeparator();
770 token = new RETokenStart(subIndex, null, true);
773 token = new RETokenStart(subIndex, sep);
777 token = new RETokenStart(subIndex, null);
782 // END OF LINE OPERATOR
785 else if ((unit.ch == '$') && !(unit.bk || quot)) {
786 addToken(currentToken);
788 RETokenEnd token = null;
789 if ((cflags & REG_MULTILINE) > 0) {
790 String sep = syntax.getLineSeparator();
792 token = new RETokenEnd(subIndex, null, true);
795 token = new RETokenEnd(subIndex, sep);
799 token = new RETokenEnd(subIndex, null);
804 // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
807 else if ((unit.ch == '.') && !(unit.bk || quot)) {
808 addToken(currentToken);
809 currentToken = new RETokenAny(subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
812 // ZERO-OR-MORE REPEAT OPERATOR
815 // This method used to check "repeat.empty.token" to avoid such regexp
816 // as "(a*)*", but now "repeat.empty.token" is allowed.
818 else if ((unit.ch == '*') && !(unit.bk || quot)) {
819 if (currentToken == null)
820 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
821 if (currentToken instanceof RETokenRepeated)
822 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
823 if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
824 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
825 currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
828 // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
829 // + | \+ depending on RE_BK_PLUS_QM
830 // not available if RE_LIMITED_OPS is set
832 // This method used to check "repeat.empty.token" to avoid such regexp
833 // as "(a*)+", but now "repeat.empty.token" is allowed.
835 else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
836 if (currentToken == null)
837 throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
839 // Check for possessive matching on RETokenRepeated
840 if (currentToken instanceof RETokenRepeated) {
841 RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
842 if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy())
843 tokenRep.makePossessive();
845 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
848 else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
849 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
851 currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
854 // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
855 // ? | \? depending on RE_BK_PLUS_QM
856 // not available if RE_LIMITED_OPS is set
857 // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
859 else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
860 if (currentToken == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
862 // Check for stingy matching on RETokenRepeated
863 if (currentToken instanceof RETokenRepeated) {
864 RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
865 if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive())
866 tokenRep.makeStingy();
868 throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
870 else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
871 throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
873 currentToken = setRepeated(currentToken,0,1,index);
879 else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) {
880 CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
882 throw new REException("invalid octal character", REException.REG_ESCAPE, index);
883 index = index - 2 + ce.len;
884 addToken(currentToken);
885 currentToken = new RETokenChar(subIndex,ce.ch,insens);
886 if (insensUSASCII) currentToken.unicodeAware = false;
889 // BACKREFERENCE OPERATOR
890 // \1 \2 ... \9 and \10 \11 \12 ...
891 // not available if RE_NO_BK_REFS is set
892 // Perl recognizes \10, \11, and so on only if enough number of
893 // parentheses have opened before it, otherwise they are treated
894 // as aliases of \010, \011, ... (octal characters). In case of
895 // Sun's JDK, octal character expression must always begin with \0.
896 // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
897 // JDK treats \2 as a back reference to the 2nd group because
898 // there are only two groups. But in our poor implementation,
899 // we cannot help but treat \29 as a back reference to the 29th group.
901 else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
902 addToken(currentToken);
903 int numBegin = index - 1;
904 int numEnd = pLength;
905 for (int i = index; i < pLength; i++) {
906 if (! Character.isDigit(pattern[i])) {
911 int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
913 currentToken = new RETokenBackRef(subIndex,num,insens);
914 if (insensUSASCII) currentToken.unicodeAware = false;
918 // START OF STRING OPERATOR
919 // \A if RE_STRING_ANCHORS is set
921 else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
922 addToken(currentToken);
923 currentToken = new RETokenStart(subIndex,null);
926 // WORD BREAK OPERATOR
929 else if (unit.bk && (unit.ch == 'b') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
930 addToken(currentToken);
931 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, false);
934 // WORD BEGIN OPERATOR
936 else if (unit.bk && (unit.ch == '<')) {
937 addToken(currentToken);
938 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN, false);
943 else if (unit.bk && (unit.ch == '>')) {
944 addToken(currentToken);
945 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.END, false);
948 // NON-WORD BREAK OPERATOR
951 else if (unit.bk && (unit.ch == 'B') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
952 addToken(currentToken);
953 currentToken = new RETokenWordBoundary(subIndex, RETokenWordBoundary.BEGIN | RETokenWordBoundary.END, true);
958 // \d if RE_CHAR_CLASS_ESCAPES is set
960 else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
961 addToken(currentToken);
962 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
963 if (insensUSASCII) currentToken.unicodeAware = false;
966 // NON-DIGIT OPERATOR
969 else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
970 addToken(currentToken);
971 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
972 if (insensUSASCII) currentToken.unicodeAware = false;
978 else if (unit.bk && (unit.ch == 'n')) {
979 addToken(currentToken);
980 currentToken = new RETokenChar(subIndex,'\n',false);
986 else if (unit.bk && (unit.ch == 'r')) {
987 addToken(currentToken);
988 currentToken = new RETokenChar(subIndex,'\r',false);
991 // WHITESPACE OPERATOR
992 // \s if RE_CHAR_CLASS_ESCAPES is set
994 else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
995 addToken(currentToken);
996 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
997 if (insensUSASCII) currentToken.unicodeAware = false;
1000 // NON-WHITESPACE OPERATOR
1003 else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1004 addToken(currentToken);
1005 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
1006 if (insensUSASCII) currentToken.unicodeAware = false;
1012 else if (unit.bk && (unit.ch == 't')) {
1013 addToken(currentToken);
1014 currentToken = new RETokenChar(subIndex,'\t',false);
1017 // ALPHANUMERIC OPERATOR
1020 else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1021 addToken(currentToken);
1022 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
1023 if (insensUSASCII) currentToken.unicodeAware = false;
1026 // NON-ALPHANUMERIC OPERATOR
1029 else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
1030 addToken(currentToken);
1031 currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
1032 if (insensUSASCII) currentToken.unicodeAware = false;
1035 // END OF STRING OPERATOR
1038 // FIXME: \Z and \z are different in that if the input string
1039 // ends with a line terminator, \Z matches the position before
1040 // the final terminator. This special behavior of \Z is yet
1041 // to be implemented.
1043 else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') &&
1044 syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1045 addToken(currentToken);
1046 currentToken = new RETokenEnd(subIndex,null);
1049 // HEX CHARACTER, UNICODE CHARACTER
1052 else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1053 (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1054 CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
1056 throw new REException("invalid hex character", REException.REG_ESCAPE, index);
1057 index = index - 2 + ce.len;
1058 addToken(currentToken);
1059 currentToken = new RETokenChar(subIndex,ce.ch,insens);
1060 if (insensUSASCII) currentToken.unicodeAware = false;
1064 // \p{prop}, \P{prop}
1066 else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
1067 (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
1068 NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
1070 throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1071 index = index - 2 + np.len;
1072 addToken(currentToken);
1073 currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
1074 if (insensUSASCII) currentToken.unicodeAware = false;
1077 // END OF PREVIOUS MATCH
1080 else if (unit.bk && (unit.ch == 'G') &&
1081 syntax.get(RESyntax.RE_STRING_ANCHORS)) {
1082 addToken(currentToken);
1083 currentToken = new RETokenEndOfPreviousMatch(subIndex);
1086 // NON-SPECIAL CHARACTER (or escape to make literal)
1087 // c | \* for example
1089 else { // not a special character
1090 addToken(currentToken);
1091 currentToken = new RETokenChar(subIndex,unit.ch,insens);
1092 if (insensUSASCII) currentToken.unicodeAware = false;
1096 // Add final buffered token and an EndSub marker
1097 addToken(currentToken);
1099 if (branches != null) {
1100 branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
1101 branches.trimToSize(); // compact the Vector
1104 firstToken = lastToken = null;
1105 addToken(new RETokenOneOf(subIndex,branches,false));
1107 else addToken(new RETokenEndSub(subIndex));
1111 private static class ParseCharClassResult {
1114 boolean returnAtAndOperator = false;
1118 * Parse [...] or [^...] and make an RETokenOneOf instance.
1119 * @param subIndex subIndex to be given to the created RETokenOneOf instance.
1120 * @param pattern Input array of characters to be parsed.
1121 * @param index Index pointing to the character next to the beginning '['.
1122 * @param pLength Limit of the input array.
1123 * @param cflags Compilation flags used to parse the pattern.
1124 * @param pflags Flags that affect the behavior of this method.
1125 * @param syntax Syntax used to parse the pattern.
1127 private static ParseCharClassResult parseCharClass(int subIndex,
1128 char[] pattern, int index,
1129 int pLength, int cflags, RESyntax syntax, int pflags)
1130 throws REException {
1132 boolean insens = ((cflags & REG_ICASE) > 0);
1133 boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
1134 Vector options = new Vector();
1135 Vector addition = new Vector();
1136 boolean additionAndAppeared = false;
1137 final int RETURN_AT_AND = 0x01;
1138 boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
1139 boolean negative = false;
1143 boolean lastCharIsSet = false;
1144 if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
1146 // Check for initial caret, negation
1147 if ((ch = pattern[index]) == '^') {
1149 if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1150 ch = pattern[index];
1153 // Check for leading right bracket literal
1155 lastChar = ch; lastCharIsSet = true;
1156 if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1159 while ((ch = pattern[index++]) != ']') {
1160 if ((ch == '-') && (lastCharIsSet)) {
1161 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1162 if ((ch = pattern[index]) == ']') {
1163 RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
1164 if (insensUSASCII) t.unicodeAware = false;
1165 options.addElement(t);
1168 if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1169 CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
1171 throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1173 index = index + ce.len - 1;
1175 RETokenRange t = new RETokenRange(subIndex,lastChar,ch,insens);
1176 if (insensUSASCII) t.unicodeAware = false;
1177 options.addElement(t);
1178 lastChar = 0; lastCharIsSet = false;
1181 } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
1182 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1184 boolean negate = false;
1186 boolean asciiEscIsSet = false;
1187 NamedProperty np = null;
1188 if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
1189 switch (pattern[index]) {
1193 posixID = RETokenPOSIX.DIGIT;
1198 posixID = RETokenPOSIX.SPACE;
1203 posixID = RETokenPOSIX.ALNUM;
1207 if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
1208 np = getNamedProperty(pattern, index - 1, pLength);
1210 throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1211 index = index - 1 + np.len - 1;
1214 CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
1216 throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
1217 asciiEsc = ce.ch; asciiEscIsSet = true;
1218 index = index - 1 + ce.len - 1;
1220 if (lastCharIsSet) {
1221 RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
1222 if (insensUSASCII) t.unicodeAware = false;
1223 options.addElement(t);
1226 if (posixID != -1) {
1227 RETokenPOSIX t = new RETokenPOSIX(subIndex,posixID,insens,negate);
1228 if (insensUSASCII) t.unicodeAware = false;
1229 options.addElement(t);
1230 } else if (np != null) {
1231 RETokenNamedProperty t = getRETokenNamedProperty(subIndex,np,insens,index);
1232 if (insensUSASCII) t.unicodeAware = false;
1233 options.addElement(t);
1234 } else if (asciiEscIsSet) {
1235 lastChar = asciiEsc; lastCharIsSet = true;
1237 lastChar = pattern[index]; lastCharIsSet = true;
1240 } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
1241 StringBuffer posixSet = new StringBuffer();
1242 index = getPosixSet(pattern,index+1,posixSet);
1243 int posixId = RETokenPOSIX.intValue(posixSet.toString());
1244 if (posixId != -1) {
1245 RETokenPOSIX t = new RETokenPOSIX(subIndex,posixId,insens,false);
1246 if (insensUSASCII) t.unicodeAware = false;
1247 options.addElement(t);
1249 } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
1250 ParseCharClassResult result = parseCharClass(
1251 subIndex, pattern, index, pLength, cflags, syntax, 0);
1252 addition.addElement(result.token);
1253 addition.addElement("|");
1254 index = result.index;
1255 } else if ((ch == '&') &&
1256 (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
1257 (index < pLength) && (pattern[index] == '&')) {
1258 if (returnAtAndOperator) {
1259 ParseCharClassResult result = new ParseCharClassResult();
1260 options.trimToSize();
1261 if (additionAndAppeared) addition.addElement("&");
1262 if (addition.size() == 0) addition = null;
1263 result.token = new RETokenOneOf(subIndex,
1264 options, addition, negative);
1265 result.index = index - 1;
1266 result.returnAtAndOperator = true;
1269 // The precedence of the operator "&&" is the lowest.
1270 // So we postpone adding "&" until other elements
1271 // are added. And we insert Boolean.FALSE at the
1272 // beginning of the list of tokens following "&&".
1273 // So, "&&[a-b][k-m]" will be stored in the Vecter
1274 // addition in this order:
1275 // Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
1276 if (additionAndAppeared) addition.addElement("&");
1277 addition.addElement(Boolean.FALSE);
1278 additionAndAppeared = true;
1280 // The part on which "&&" operates may be either
1281 // (1) explicitly enclosed by []
1283 // (2) not enclosed by [] and terminated by the
1284 // next "&&" or the end of the character list.
1285 // Let the preceding else if block do the case (1).
1286 // We must do something in case of (2).
1287 if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
1288 ParseCharClassResult result = parseCharClass(
1289 subIndex, pattern, index+1, pLength, cflags, syntax,
1291 addition.addElement(result.token);
1292 addition.addElement("|");
1293 // If the method returned at the next "&&", it is OK.
1294 // Otherwise we have eaten the mark of the end of this
1295 // character list "]". In this case we must give back
1297 index = (result.returnAtAndOperator ?
1298 result.index: result.index - 1);
1301 if (lastCharIsSet) {
1302 RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
1303 if (insensUSASCII) t.unicodeAware = false;
1304 options.addElement(t);
1306 lastChar = ch; lastCharIsSet = true;
1308 if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
1310 // Out of list, index is one past ']'
1312 if (lastCharIsSet) {
1313 RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
1314 if (insensUSASCII) t.unicodeAware = false;
1315 options.addElement(t);
1318 ParseCharClassResult result = new ParseCharClassResult();
1319 // Create a new RETokenOneOf
1320 options.trimToSize();
1321 if (additionAndAppeared) addition.addElement("&");
1322 if (addition.size() == 0) addition = null;
1323 result.token = new RETokenOneOf(subIndex,options, addition, negative);
1324 result.index = index;
1328 private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
1329 unit.ch = input[index++];
1330 unit.bk = (unit.ch == '\\'
1331 && (!quot || index >= input.length || input[index] == 'E'));
1333 if (index < input.length)
1334 unit.ch = input[index++];
1335 else throw new REException(getLocalizedMessage("ends.with.backslash"),REException.REG_ESCAPE,index);
1339 private static int parseInt(char[] input, int pos, int len, int radix) {
1341 for (int i = pos; i < pos + len; i++) {
1342 ret = ret * radix + Character.digit(input[i], radix);
1348 * This class represents various expressions for a character.
1350 * "\0123" : Octal char 0123
1351 * "\x1b" : Hex char 0x1b
1352 * "\u1234" : Unicode char \u1234
1354 private static class CharExpression {
1355 /** character represented by this expression */
1357 /** String expression */
1359 /** length of this expression */
1361 public String toString() { return expr; }
1364 private static CharExpression getCharExpression(char[] input, int pos, int lim,
1366 CharExpression ce = new CharExpression();
1367 char c = input[pos];
1369 if (pos + 1 >= lim) return null;
1386 if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
1387 (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
1389 int expectedLength = (c == 'x' ? 2 : 4);
1390 for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
1391 if (i >= lim) break;
1392 if (!((input[i] >= '0' && input[i] <= '9') ||
1393 (input[i] >= 'A' && input[i] <= 'F') ||
1394 (input[i] >= 'a' && input[i] <= 'f')))
1398 if (l != expectedLength) return null;
1399 ce.ch = (char)(parseInt(input, pos + 2, l, 16));
1408 if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
1410 for (int i = pos + 2; i < pos + 2 + 3; i++) {
1411 if (i >= lim) break;
1412 if (input[i] < '0' || input[i] > '7') break;
1415 if (l == 3 && input[pos + 2] > '3') l--;
1416 if (l <= 0) return null;
1417 ce.ch = (char)(parseInt(input, pos + 2, l, 8));
1435 ce.expr = new String(input, pos, ce.len);
1440 * This class represents a substring in a pattern string expressing
1442 * "\pA" : Property named "A"
1443 * "\p{prop}" : Property named "prop"
1444 * "\PA" : Property named "A" (Negated)
1445 * "\P{prop}" : Property named "prop" (Negated)
1447 private static class NamedProperty {
1448 /** Property name */
1450 /** Negated or not */
1452 /** length of this expression */
1456 private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
1457 NamedProperty np = new NamedProperty();
1458 char c = input[pos];
1460 if (++pos >= lim) return null;
1475 for (int i = pos; i < lim; i++) {
1476 if (input[i] == '}') {
1481 if (p < 0) return null;
1483 np.name = new String(input, pos, len);
1487 np.name = new String(input, pos - 1, 1);
1495 private static RETokenNamedProperty getRETokenNamedProperty(
1496 int subIndex, NamedProperty np, boolean insens, int index)
1497 throws REException {
1499 return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
1501 catch (REException e) {
1503 ree = new REException(e.getMessage(), REException.REG_ESCAPE, index);
1510 * Checks if the regular expression matches the input in its entirety.
1512 * @param input The input text.
1514 public boolean isMatch(Object input) {
1515 return isMatch(input,0,0);
1519 * Checks if the input string, starting from index, is an exact match of
1520 * this regular expression.
1522 * @param input The input text.
1523 * @param index The offset index at which the search should be begin.
1525 public boolean isMatch(Object input,int index) {
1526 return isMatch(input,index,0);
1531 * Checks if the input, starting from index and using the specified
1532 * execution flags, is an exact match of this regular expression.
1534 * @param input The input text.
1535 * @param index The offset index at which the search should be begin.
1536 * @param eflags The logical OR of any execution flags above.
1538 public boolean isMatch(Object input,int index,int eflags) {
1539 return isMatchImpl(makeCharIndexed(input,index),index,eflags);
1542 private boolean isMatchImpl(CharIndexed input, int index, int eflags) {
1543 if (firstToken == null) // Trivial case
1544 return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS);
1545 REMatch m = new REMatch(numSubs, index, eflags);
1546 if (firstToken.match(input, m)) {
1548 if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) {
1557 * Returns the maximum number of subexpressions in this regular expression.
1558 * If the expression contains branches, the value returned will be the
1559 * maximum subexpressions in any of the branches.
1561 public int getNumSubs() {
1565 // Overrides REToken.setUncle
1566 void setUncle(REToken uncle) {
1567 if (lastToken != null) {
1568 lastToken.setUncle(uncle);
1569 } else super.setUncle(uncle); // to deal with empty subexpressions
1572 // Overrides REToken.chain
1574 boolean chain(REToken next) {
1581 * Returns the minimum number of characters that could possibly
1582 * constitute a match of this regular expression.
1584 public int getMinimumLength() {
1585 return minimumLength;
1588 public int getMaximumLength() {
1589 return maximumLength;
1593 * Returns an array of all matches found in the input.
1595 * If the regular expression allows the empty string to match, it will
1596 * substitute matches at all positions except the end of the input.
1598 * @param input The input text.
1599 * @return a non-null (but possibly zero-length) array of matches
1601 public REMatch[] getAllMatches(Object input) {
1602 return getAllMatches(input,0,0);
1606 * Returns an array of all matches found in the input,
1607 * beginning at the specified index position.
1609 * If the regular expression allows the empty string to match, it will
1610 * substitute matches at all positions except the end of the input.
1612 * @param input The input text.
1613 * @param index The offset index at which the search should be begin.
1614 * @return a non-null (but possibly zero-length) array of matches
1616 public REMatch[] getAllMatches(Object input, int index) {
1617 return getAllMatches(input,index,0);
1621 * Returns an array of all matches found in the input string,
1622 * beginning at the specified index position and using the specified
1625 * If the regular expression allows the empty string to match, it will
1626 * substitute matches at all positions except the end of the input.
1628 * @param input The input text.
1629 * @param index The offset index at which the search should be begin.
1630 * @param eflags The logical OR of any execution flags above.
1631 * @return a non-null (but possibly zero-length) array of matches
1633 public REMatch[] getAllMatches(Object input, int index, int eflags) {
1634 return getAllMatchesImpl(makeCharIndexed(input,index),index,eflags);
1637 // this has been changed since 1.03 to be non-overlapping matches
1638 private REMatch[] getAllMatchesImpl(CharIndexed input, int index, int eflags) {
1639 Vector all = new Vector();
1641 while ((m = getMatchImpl(input,index,eflags,null)) != null) {
1643 index = m.getEndIndex();
1644 if (m.end[0] == 0) { // handle pathological case of zero-length match
1648 input.move(m.end[0]);
1650 if (!input.isValid()) break;
1652 REMatch[] mset = new REMatch[all.size()];
1657 /* Implements abstract method REToken.match() */
1658 boolean match(CharIndexed input, REMatch mymatch) {
1659 input.setHitEnd(mymatch);
1660 if (firstToken == null) {
1661 return next(input, mymatch);
1664 // Note the start of this subexpression
1665 mymatch.start1[subIndex] = mymatch.index;
1667 return firstToken.match(input, mymatch);
1670 REMatch findMatch(CharIndexed input, REMatch mymatch) {
1671 if (mymatch.backtrackStack == null)
1672 mymatch.backtrackStack = new BacktrackStack();
1673 boolean b = match(input, mymatch);
1681 * Returns the first match found in the input. If no match is found,
1684 * @param input The input text.
1685 * @return An REMatch instance referencing the match, or null if none.
1687 public REMatch getMatch(Object input) {
1688 return getMatch(input,0,0);
1692 * Returns the first match found in the input, beginning
1693 * the search at the specified index. If no match is found,
1696 * @param input The input text.
1697 * @param index The offset within the text to begin looking for a match.
1698 * @return An REMatch instance referencing the match, or null if none.
1700 public REMatch getMatch(Object input, int index) {
1701 return getMatch(input,index,0);
1705 * Returns the first match found in the input, beginning
1706 * the search at the specified index, and using the specified
1707 * execution flags. If no match is found, returns null.
1709 * @param input The input text.
1710 * @param index The offset index at which the search should be begin.
1711 * @param eflags The logical OR of any execution flags above.
1712 * @return An REMatch instance referencing the match, or null if none.
1714 public REMatch getMatch(Object input, int index, int eflags) {
1715 return getMatch(input,index,eflags,null);
1719 * Returns the first match found in the input, beginning the search
1720 * at the specified index, and using the specified execution flags.
1721 * If no match is found, returns null. If a StringBuffer is
1722 * provided and is non-null, the contents of the input text from the
1723 * index to the beginning of the match (or to the end of the input,
1724 * if there is no match) are appended to the StringBuffer.
1726 * @param input The input text.
1727 * @param index The offset index at which the search should be begin.
1728 * @param eflags The logical OR of any execution flags above.
1729 * @param buffer The StringBuffer to save pre-match text in.
1730 * @return An REMatch instance referencing the match, or null if none. */
1731 public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
1732 return getMatchImpl(makeCharIndexed(input,index),index,eflags,buffer);
1735 REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
1736 boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
1737 boolean doMove = ((eflags & REG_FIX_STARTING_POSITION) == 0);
1738 RE re = (tryEntireMatch ? (RE) this.clone() : this);
1739 if (tryEntireMatch) {
1740 RETokenEnd reEnd = new RETokenEnd(0, null);
1741 reEnd.setFake(true);
1744 // Create a new REMatch to hold results
1745 REMatch mymatch = new REMatch(numSubs, anchor, eflags);
1747 /* The following potimization is commented out because
1748 the matching should be tried even if the length of
1749 input is obviously too short in order that
1750 java.util.regex.Matcher#hitEnd() may work correctly.
1751 // Optimization: check if anchor + minimumLength > length
1752 if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
1754 if (re.match(input, mymatch)) {
1755 REMatch best = mymatch;
1756 // We assume that the match that coms first is the best.
1757 // And the following "The longer, the better" rule has
1758 // been commented out. The longest is not neccesarily
1759 // the best. For example, "a" out of "aaa" is the best
1762 // Find best match of them all to observe leftmost longest
1763 while ((mymatch = mymatch.next) != null) {
1764 if (mymatch.index > best.index) {
1769 best.end[0] = best.index;
1771 input.setLastMatch(best);
1774 /* End of the optimization commented out
1777 mymatch.clear(++anchor);
1778 // Append character to buffer if needed
1779 if (buffer != null && input.charAt(0) != CharIndexed.OUT_OF_BOUNDS) {
1780 buffer.append(input.charAt(0));
1782 // java.util.regex.Matcher#hitEnd() requires that the search should
1783 // be tried at the end of input, so we use move1(1) instead of move(1)
1784 } while (doMove && input.move1(1));
1786 // Special handling at end of input for e.g. "$"
1787 if (minimumLength == 0) {
1788 if (match(input, mymatch)) {
1789 mymatch.finish(input);
1798 * Returns an REMatchEnumeration that can be used to iterate over the
1799 * matches found in the input text.
1801 * @param input The input text.
1802 * @return A non-null REMatchEnumeration instance.
1804 public REMatchEnumeration getMatchEnumeration(Object input) {
1805 return getMatchEnumeration(input,0,0);
1810 * Returns an REMatchEnumeration that can be used to iterate over the
1811 * matches found in the input text.
1813 * @param input The input text.
1814 * @param index The offset index at which the search should be begin.
1815 * @return A non-null REMatchEnumeration instance, with its input cursor
1816 * set to the index position specified.
1818 public REMatchEnumeration getMatchEnumeration(Object input, int index) {
1819 return getMatchEnumeration(input,index,0);
1823 * Returns an REMatchEnumeration that can be used to iterate over the
1824 * matches found in the input text.
1826 * @param input The input text.
1827 * @param index The offset index at which the search should be begin.
1828 * @param eflags The logical OR of any execution flags above.
1829 * @return A non-null REMatchEnumeration instance, with its input cursor
1830 * set to the index position specified.
1832 public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
1833 return new REMatchEnumeration(this,makeCharIndexed(input,index),index,eflags);
1838 * Substitutes the replacement text for the first match found in the input.
1840 * @param input The input text.
1841 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1842 * @return A String interpolating the substituted text.
1843 * @see REMatch#substituteInto
1845 public String substitute(Object input,String replace) {
1846 return substitute(input,replace,0,0);
1850 * Substitutes the replacement text for the first match found in the input
1851 * beginning at the specified index position. Specifying an index
1852 * effectively causes the regular expression engine to throw away the
1853 * specified number of characters.
1855 * @param input The input text.
1856 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1857 * @param index The offset index at which the search should be begin.
1858 * @return A String containing the substring of the input, starting
1859 * at the index position, and interpolating the substituted text.
1860 * @see REMatch#substituteInto
1862 public String substitute(Object input,String replace,int index) {
1863 return substitute(input,replace,index,0);
1867 * Substitutes the replacement text for the first match found in the input
1868 * string, beginning at the specified index position and using the
1869 * specified execution flags.
1871 * @param input The input text.
1872 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1873 * @param index The offset index at which the search should be begin.
1874 * @param eflags The logical OR of any execution flags above.
1875 * @return A String containing the substring of the input, starting
1876 * at the index position, and interpolating the substituted text.
1877 * @see REMatch#substituteInto
1879 public String substitute(Object input,String replace,int index,int eflags) {
1880 return substituteImpl(makeCharIndexed(input,index),replace,index,eflags);
1883 private String substituteImpl(CharIndexed input,String replace,int index,int eflags) {
1884 StringBuffer buffer = new StringBuffer();
1885 REMatch m = getMatchImpl(input,index,eflags,buffer);
1886 if (m==null) return buffer.toString();
1887 buffer.append(getReplacement(replace, m, eflags));
1888 if (input.move(m.end[0])) {
1890 buffer.append(input.charAt(0));
1891 } while (input.move(1));
1893 return buffer.toString();
1897 * Substitutes the replacement text for each non-overlapping match found
1898 * in the input text.
1900 * @param input The input text.
1901 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1902 * @return A String interpolating the substituted text.
1903 * @see REMatch#substituteInto
1905 public String substituteAll(Object input,String replace) {
1906 return substituteAll(input,replace,0,0);
1910 * Substitutes the replacement text for each non-overlapping match found
1911 * in the input text, starting at the specified index.
1913 * If the regular expression allows the empty string to match, it will
1914 * substitute matches at all positions except the end of the input.
1916 * @param input The input text.
1917 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1918 * @param index The offset index at which the search should be begin.
1919 * @return A String containing the substring of the input, starting
1920 * at the index position, and interpolating the substituted text.
1921 * @see REMatch#substituteInto
1923 public String substituteAll(Object input,String replace,int index) {
1924 return substituteAll(input,replace,index,0);
1928 * Substitutes the replacement text for each non-overlapping match found
1929 * in the input text, starting at the specified index and using the
1930 * specified execution flags.
1932 * @param input The input text.
1933 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1934 * @param index The offset index at which the search should be begin.
1935 * @param eflags The logical OR of any execution flags above.
1936 * @return A String containing the substring of the input, starting
1937 * at the index position, and interpolating the substituted text.
1938 * @see REMatch#substituteInto
1940 public String substituteAll(Object input,String replace,int index,int eflags) {
1941 return substituteAllImpl(makeCharIndexed(input,index),replace,index,eflags);
1944 private String substituteAllImpl(CharIndexed input,String replace,int index,int eflags) {
1945 StringBuffer buffer = new StringBuffer();
1947 while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1948 buffer.append(getReplacement(replace, m, eflags));
1949 index = m.getEndIndex();
1950 if (m.end[0] == 0) {
1951 char ch = input.charAt(0);
1952 if (ch != CharIndexed.OUT_OF_BOUNDS)
1956 input.move(m.end[0]);
1959 if (!input.isValid()) break;
1961 return buffer.toString();
1964 public static String getReplacement(String replace, REMatch m, int eflags) {
1965 if ((eflags & REG_NO_INTERPOLATE) > 0)
1968 if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) {
1969 StringBuffer sb = new StringBuffer();
1970 int l = replace.length();
1971 for (int i = 0; i < l; i++) {
1972 char c = replace.charAt(i);
1976 // Let StringIndexOutOfBoundsException be thrown.
1977 sb.append(replace.charAt(i));
1981 while (i1 < replace.length() &&
1982 Character.isDigit(replace.charAt(i1))) i1++;
1983 sb.append(m.substituteInto(replace.substring(i, i1)));
1990 return sb.toString();
1993 return m.substituteInto(replace);
1997 /* Helper function for constructor */
1998 private void addToken(REToken next) {
1999 if (next == null) return;
2000 minimumLength += next.getMinimumLength();
2001 int nmax = next.getMaximumLength();
2002 if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
2003 maximumLength += nmax;
2005 maximumLength = Integer.MAX_VALUE;
2007 if (firstToken == null) {
2008 lastToken = firstToken = next;
2010 // if chain returns false, it "rejected" the token due to
2011 // an optimization, and next was combined with lastToken
2012 if (lastToken.chain(next)) {
2018 private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
2019 if (current == null) throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
2020 return new RETokenRepeated(current.subIndex,current,min,max);
2023 private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
2024 // Precondition: pattern[index-1] == ':'
2025 // we will return pos of closing ']'.
2027 for (i=index; i<(pattern.length-1); i++) {
2028 if ((pattern[i] == ':') && (pattern[i+1] == ']'))
2030 buf.append(pattern[i]);
2032 return index; // didn't match up
2035 private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
2036 // Precondition: input[index-1] == '{', minMax != null
2038 boolean mustMatch = !syntax.get(RESyntax.RE_NO_BK_BRACES);
2039 int startIndex = index;
2040 if (index == input.length) {
2042 throw new REException(getLocalizedMessage("unmatched.brace"),REException.REG_EBRACE,index);
2048 CharUnit unit = new CharUnit();
2049 StringBuffer buf = new StringBuffer();
2051 // Read string of digits
2053 index = getCharUnit(input,index,unit,false);
2054 if (Character.isDigit(unit.ch))
2055 buf.append(unit.ch);
2056 } while ((index != input.length) && Character.isDigit(unit.ch));
2058 // Check for {} tomfoolery
2059 if (buf.length() == 0) {
2061 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2066 min = Integer.parseInt(buf.toString());
2068 if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
2070 else if (index == input.length)
2072 throw new REException(getLocalizedMessage("interval.no.end"),REException.REG_EBRACE,index);
2075 else if ((unit.ch == ',') && !unit.bk) {
2076 buf = new StringBuffer();
2077 // Read string of digits
2078 while (((index = getCharUnit(input,index,unit,false)) != input.length) && Character.isDigit(unit.ch))
2079 buf.append(unit.ch);
2081 if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
2083 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2087 // This is the case of {x,}
2088 if (buf.length() == 0) max = Integer.MAX_VALUE;
2089 else max = Integer.parseInt(buf.toString());
2092 throw new REException(getLocalizedMessage("interval.error"),REException.REG_EBRACE,index);
2096 // We know min and max now, and they are valid.
2099 minMax.second = max;
2101 // return the index following the '}'
2106 * Return a human readable form of the compiled regular expression,
2107 * useful for debugging.
2109 public String toString() {
2110 StringBuffer sb = new StringBuffer();
2112 return sb.toString();
2115 void dump(StringBuffer os) {
2116 os.append("(?#startRE subIndex=" + subIndex + ")");
2119 if (firstToken != null)
2120 firstToken.dumpAll(os);
2123 os.append("(?#endRE subIndex=" + subIndex + ")");
2126 // Cast input appropriately or throw exception
2127 // This method was originally a private method, but has been made
2128 // public because java.util.regex.Matcher uses this.
2129 public static CharIndexed makeCharIndexed(Object input, int index) {
2130 // The case where input is already a CharIndexed is supposed
2131 // be the most likely because this is the case with
2132 // java.util.regex.Matcher.
2133 // We could let a String or a CharSequence fall through
2134 // to final input, but since it'a very likely input type,
2135 // we check it first.
2136 if (input instanceof CharIndexed) {
2137 CharIndexed ci = (CharIndexed) input;
2138 ci.setAnchor(index);
2141 else if (input instanceof CharSequence)
2142 return new CharIndexedCharSequence((CharSequence) input,index);
2143 else if (input instanceof String)
2144 return new CharIndexedString((String) input,index);
2145 else if (input instanceof char[])
2146 return new CharIndexedCharArray((char[]) input,index);
2147 else if (input instanceof StringBuffer)
2148 return new CharIndexedStringBuffer((StringBuffer) input,index);
2149 else if (input instanceof InputStream)
2150 return new CharIndexedInputStream((InputStream) input,index);
2152 return new CharIndexedString(input.toString(), index);