ext/scintilla/src/RESearch.cxx

   1 // Scintilla source code edit control\r
   2 /** @file RESearch.cxx\r
   3  ** Regular expression search library.\r
   4  **/\r
   5 \r
   6 /*\r
   7  * regex - Regular expression pattern matching and replacement\r
   8  *\r
   9  * By:  Ozan S. Yigit (oz)\r
  10  *      Dept. of Computer Science\r
  11  *      York University\r
  12  *\r
  13  * Original code available from http://www.cs.yorku.ca/~oz/\r
  14  * Translation to C++ by Neil Hodgson neilh@scintilla.org\r
  15  * Removed all use of register.\r
  16  * Converted to modern function prototypes.\r
  17  * Put all global/static variables into an object so this code can be\r
  18  * used from multiple threads, etc.\r
  19  * Some extensions by Philippe Lhoste PhiLho(a)GMX.net\r
  20  *\r
  21  * These routines are the PUBLIC DOMAIN equivalents of regex\r
  22  * routines as found in 4.nBSD UN*X, with minor extensions.\r
  23  *\r
  24  * These routines are derived from various implementations found\r
  25  * in software tools books, and Conroy's grep. They are NOT derived\r
  26  * from licensed/restricted software.\r
  27  * For more interesting/academic/complicated implementations,\r
  28  * see Henry Spencer's regexp routines, or GNU Emacs pattern\r
  29  * matching module.\r
  30  *\r
  31  * Modification history removed.\r
  32  *\r
  33  * Interfaces:\r
  34  *  RESearch::Compile:      compile a regular expression into a NFA.\r
  35  *\r
  36  *          const char *RESearch::Compile(const char *pattern, int length,\r
  37  *                                        bool caseSensitive, bool posix)\r
  38  *\r
  39  * Returns a short error string if they fail.\r
  40  *\r
  41  *  RESearch::Execute:      execute the NFA to match a pattern.\r
  42  *\r
  43  *          int RESearch::Execute(characterIndexer &ci, int lp, int endp)\r
  44  *\r
  45  *  RESearch::Substitute:   substitute the matched portions in a new string.\r
  46  *\r
  47  *          int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)\r
  48  *\r
  49  *  re_fail:                failure routine for RESearch::Execute. (no longer used)\r
  50  *\r
  51  *          void re_fail(char *msg, char op)\r
  52  *\r
  53  * Regular Expressions:\r
  54  *\r
  55  *      [1]     char    matches itself, unless it is a special\r
  56  *                      character (metachar): . \ [ ] * + ^ $\r
  57  *                      and ( ) if posix option.\r
  58  *\r
  59  *      [2]     .       matches any character.\r
  60  *\r
  61  *      [3]     \       matches the character following it, except:\r
  62  *                      - \a, \b, \f, \n, \r, \t, \v match the corresponding C\r
  63  *                      escape char, respectively BEL, BS, FF, LF, CR, TAB and VT;\r
  64  *                      Note that \r and \n are never matched because Scintilla\r
  65  *                      regex searches are made line per line\r
  66  *                      (stripped of end-of-line chars).\r
  67  *                      - if not in posix mode, when followed by a\r
  68  *                      left or right round bracket (see [7]);\r
  69  *                      - when followed by a digit 1 to 9 (see [8]);\r
  70  *                      - when followed by a left or right angle bracket\r
  71  *                      (see [9]);\r
  72  *                      - when followed by d, D, s, S, w or W (see [10]);\r
  73  *                      - when followed by x and two hexa digits (see [11].\r
  74  *                      Backslash is used as an escape character for all\r
  75  *                      other meta-characters, and itself.\r
  76  *\r
  77  *      [4]     [set]   matches one of the characters in the set.\r
  78  *                      If the first character in the set is "^",\r
  79  *                      it matches the characters NOT in the set, i.e.\r
  80  *                      complements the set. A shorthand S-E (start dash end)\r
  81  *                      is used to specify a set of characters S up to\r
  82  *                      E, inclusive. S and E must be characters, otherwise\r
  83  *                      the dash is taken literally (eg. in expression [\d-a]).\r
  84  *                      The special characters "]" and "-" have no special\r
  85  *                      meaning if they appear as the first chars in the set.\r
  86  *                      To include both, put - first: [-]A-Z]\r
  87  *                      (or just backslash them).\r
  88  *                      examples:        match:\r
  89  *\r
  90  *                              [-]|]    matches these 3 chars,\r
  91  *\r
  92  *                              []-|]    matches from ] to | chars\r
  93  *\r
  94  *                              [a-z]    any lowercase alpha\r
  95  *\r
  96  *                              [^-]]    any char except - and ]\r
  97  *\r
  98  *                              [^A-Z]   any char except uppercase\r
  99  *                                       alpha\r
 100  *\r
 101  *                              [a-zA-Z] any alpha\r
 102  *\r
 103  *      [5]     *       any regular expression form [1] to [4]\r
 104  *                      (except [7], [8] and [9] forms of [3]),\r
 105  *                      followed by closure char (*)\r
 106  *                      matches zero or more matches of that form.\r
 107  *\r
 108  *      [6]     +       same as [5], except it matches one or more.\r
 109  *                      Both [5] and [6] are greedy (they match as much as possible).\r
 110  *\r
 111  *      [7]             a regular expression in the form [1] to [12], enclosed\r
 112  *                      as \(form\) (or (form) with posix flag) matches what\r
 113  *                      form matches. The enclosure creates a set of tags,\r
 114  *                      used for [8] and for pattern substitution.\r
 115  *                      The tagged forms are numbered starting from 1.\r
 116  *\r
 117  *      [8]             a \ followed by a digit 1 to 9 matches whatever a\r
 118  *                      previously tagged regular expression ([7]) matched.\r
 119  *\r
 120  *      [9]     \<      a regular expression starting with a \< construct\r
 121  *              \>      and/or ending with a \> construct, restricts the\r
 122  *                      pattern matching to the beginning of a word, and/or\r
 123  *                      the end of a word. A word is defined to be a character\r
 124  *                      string beginning and/or ending with the characters\r
 125  *                      A-Z a-z 0-9 and _. Scintilla extends this definition\r
 126  *                      by user setting. The word must also be preceded and/or\r
 127  *                      followed by any character outside those mentioned.\r
 128  *\r
 129  *      [10]    \l      a backslash followed by d, D, s, S, w or W,\r
 130  *                      becomes a character class (both inside and\r
 131  *                      outside sets []).\r
 132  *                        d: decimal digits\r
 133  *                        D: any char except decimal digits\r
 134  *                        s: whitespace (space, \t \n \r \f \v)\r
 135  *                        S: any char except whitespace (see above)\r
 136  *                        w: alphanumeric & underscore (changed by user setting)\r
 137  *                        W: any char except alphanumeric & underscore (see above)\r
 138  *\r
 139  *      [11]    \xHH    a backslash followed by x and two hexa digits,\r
 140  *                      becomes the character whose Ascii code is equal\r
 141  *                      to these digits. If not followed by two digits,\r
 142  *                      it is 'x' char itself.\r
 143  *\r
 144  *      [12]            a composite regular expression xy where x and y\r
 145  *                      are in the form [1] to [11] matches the longest\r
 146  *                      match of x followed by a match for y.\r
 147  *\r
 148  *      [13]    ^       a regular expression starting with a ^ character\r
 149  *              $       and/or ending with a $ character, restricts the\r
 150  *                      pattern matching to the beginning of the line,\r
 151  *                      or the end of line. [anchors] Elsewhere in the\r
 152  *                      pattern, ^ and $ are treated as ordinary characters.\r
 153  *\r
 154  *\r
 155  * Acknowledgements:\r
 156  *\r
 157  *  HCR's Hugh Redelmeier has been most helpful in various\r
 158  *  stages of development. He convinced me to include BOW\r
 159  *  and EOW constructs, originally invented by Rob Pike at\r
 160  *  the University of Toronto.\r
 161  *\r
 162  * References:\r
 163  *              Software tools                  Kernighan & Plauger\r
 164  *              Software tools in Pascal        Kernighan & Plauger\r
 165  *              Grep [rsx-11 C dist]            David Conroy\r
 166  *              ed - text editor                Un*x Programmer's Manual\r
 167  *              Advanced editing on Un*x        B. W. Kernighan\r
 168  *              RegExp routines                 Henry Spencer\r
 169  *\r
 170  * Notes:\r
 171  *\r
 172  *  This implementation uses a bit-set representation for character\r
 173  *  classes for speed and compactness. Each character is represented\r
 174  *  by one bit in a 256-bit block. Thus, CCL always takes a\r
 175  *      constant 32 bytes in the internal nfa, and RESearch::Execute does a single\r
 176  *  bit comparison to locate the character in the set.\r
 177  *\r
 178  * Examples:\r
 179  *\r
 180  *  pattern:    foo*.*\r
 181  *  compile:    CHR f CHR o CLO CHR o END CLO ANY END END\r
 182  *  matches:    fo foo fooo foobar fobar foxx ...\r
 183  *\r
 184  *  pattern:    fo[ob]a[rz]\r
 185  *  compile:    CHR f CHR o CCL bitset CHR a CCL bitset END\r
 186  *  matches:    fobar fooar fobaz fooaz\r
 187  *\r
 188  *  pattern:    foo\\+\r
 189  *  compile:    CHR f CHR o CHR o CHR \ CLO CHR \ END END\r
 190  *  matches:    foo\ foo\\ foo\\\  ...\r
 191  *\r
 192  *  pattern:    \(foo\)[1-3]\1  (same as foo[1-3]foo)\r
 193  *  compile:    BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END\r
 194  *  matches:    foo1foo foo2foo foo3foo\r
 195  *\r
 196  *  pattern:    \(fo.*\)-\1\r
 197  *  compile:    BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END\r
 198  *  matches:    foo-foo fo-fo fob-fob foobar-foobar ...\r
 199  */\r
 200 \r
 201 #include "CharClassify.h"\r
 202 #include "RESearch.h"\r
 203 \r
 204 // Shut up annoying Visual C++ warnings:\r
 205 #ifdef _MSC_VER\r
 206 #pragma warning(disable: 4514)\r
 207 #endif\r
 208 \r
 209 #ifdef SCI_NAMESPACE\r
 210 using namespace Scintilla;\r
 211 #endif\r
 212 \r
 213 #define OKP     1\r
 214 #define NOP     0\r
 215 \r
 216 #define CHR     1\r
 217 #define ANY     2\r
 218 #define CCL     3\r
 219 #define BOL     4\r
 220 #define EOL     5\r
 221 #define BOT     6\r
 222 #define EOT     7\r
 223 #define BOW     8\r
 224 #define EOW     9\r
 225 #define REF     10\r
 226 #define CLO     11\r
 227 \r
 228 #define END     0\r
 229 \r
 230 /*\r
 231  * The following defines are not meant to be changeable.\r
 232  * They are for readability only.\r
 233  */\r
 234 #define BLKIND  0370\r
 235 #define BITIND  07\r
 236 \r
 237 const char bitarr[] = { 1, 2, 4, 8, 16, 32, 64, '\200' };\r
 238 \r
 239 #define badpat(x)       (*nfa = END, x)\r
 240 \r
 241 /*\r
 242  * Character classification table for word boundary operators BOW\r
 243  * and EOW is passed in by the creator of this object (Scintilla\r
 244  * Document). The Document default state is that word chars are:\r
 245  * 0-9, a-z, A-Z and _\r
 246  */\r
 247 \r
 248 RESearch::RESearch(CharClassify *charClassTable) {\r
 249         charClass = charClassTable;\r
 250         Init();\r
 251 }\r
 252 \r
 253 RESearch::~RESearch() {\r
 254         Clear();\r
 255 }\r
 256 \r
 257 void RESearch::Init() {\r
 258         sta = NOP;                  /* status of lastpat */\r
 259         bol = 0;\r
 260         for (int i = 0; i < MAXTAG; i++)\r
 261                 pat[i] = 0;\r
 262         for (int j = 0; j < BITBLK; j++)\r
 263                 bittab[j] = 0;\r
 264 }\r
 265 \r
 266 void RESearch::Clear() {\r
 267         for (int i = 0; i < MAXTAG; i++) {\r
 268                 delete []pat[i];\r
 269                 pat[i] = 0;\r
 270                 bopat[i] = NOTFOUND;\r
 271                 eopat[i] = NOTFOUND;\r
 272         }\r
 273 }\r
 274 \r
 275 bool RESearch::GrabMatches(CharacterIndexer &ci) {\r
 276         bool success = true;\r
 277         for (unsigned int i = 0; i < MAXTAG; i++) {\r
 278                 if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {\r
 279                         unsigned int len = eopat[i] - bopat[i];\r
 280                         pat[i] = new char[len + 1];\r
 281                         if (pat[i]) {\r
 282                                 for (unsigned int j = 0; j < len; j++)\r
 283                                         pat[i][j] = ci.CharAt(bopat[i] + j);\r
 284                                 pat[i][len] = '\0';\r
 285                         } else {\r
 286                                 success = false;\r
 287                         }\r
 288                 }\r
 289         }\r
 290         return success;\r
 291 }\r
 292 \r
 293 void RESearch::ChSet(unsigned char c) {\r
 294         bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];\r
 295 }\r
 296 \r
 297 void RESearch::ChSetWithCase(unsigned char c, bool caseSensitive) {\r
 298         if (caseSensitive) {\r
 299                 ChSet(c);\r
 300         } else {\r
 301                 if ((c >= 'a') && (c <= 'z')) {\r
 302                         ChSet(c);\r
 303                         ChSet(static_cast<unsigned char>(c - 'a' + 'A'));\r
 304                 } else if ((c >= 'A') && (c <= 'Z')) {\r
 305                         ChSet(c);\r
 306                         ChSet(static_cast<unsigned char>(c - 'A' + 'a'));\r
 307                 } else {\r
 308                         ChSet(c);\r
 309                 }\r
 310         }\r
 311 }\r
 312 \r
 313 const unsigned char escapeValue(unsigned char ch) {\r
 314         switch (ch) {\r
 315         case 'a':       return '\a';\r
 316         case 'b':       return '\b';\r
 317         case 'f':       return '\f';\r
 318         case 'n':       return '\n';\r
 319         case 'r':       return '\r';\r
 320         case 't':       return '\t';\r
 321         case 'v':       return '\v';\r
 322         }\r
 323         return 0;\r
 324 }\r
 325 \r
 326 static int GetHexaChar(unsigned char hd1, unsigned char hd2) {\r
 327         int hexValue = 0;\r
 328         if (hd1 >= '0' && hd1 <= '9') {\r
 329                 hexValue += 16 * (hd1 - '0');\r
 330         } else if (hd1 >= 'A' && hd1 <= 'F') {\r
 331                 hexValue += 16 * (hd1 - 'A' + 10);\r
 332         } else if (hd1 >= 'a' && hd1 <= 'f') {\r
 333                 hexValue += 16 * (hd1 - 'a' + 10);\r
 334         } else\r
 335                 return -1;\r
 336         if (hd2 >= '0' && hd2 <= '9') {\r
 337                 hexValue += hd2 - '0';\r
 338         } else if (hd2 >= 'A' && hd2 <= 'F') {\r
 339                 hexValue += hd2 - 'A' + 10;\r
 340         } else if (hd2 >= 'a' && hd2 <= 'f') {\r
 341                 hexValue += hd2 - 'a' + 10;\r
 342         } else\r
 343                 return -1;\r
 344         return hexValue;\r
 345 }\r
 346 \r
 347 /**\r
 348  * Called when the parser finds a backslash not followed\r
 349  * by a valid expression (like \( in non-Posix mode).\r
 350  * @param pattern: pointer on the char after the backslash.\r
 351  * @param incr: (out) number of chars to skip after expression evaluation.\r
 352  * @return the char if it resolves to a simple char,\r
 353  * or -1 for a char class. In this case, bittab is changed.\r
 354  */\r
 355 int RESearch::GetBackslashExpression(\r
 356                 const char *pattern,\r
 357                 int &incr) {\r
 358         // Since error reporting is primitive and messages are not used anyway,\r
 359         // I choose to interpret unexpected syntax in a logical way instead\r
 360         // of reporting errors. Otherwise, we can stick on, eg., PCRE behavior.\r
 361         incr = 0;       // Most of the time, will skip the char "naturally".\r
 362         int c;\r
 363         int result = -1;\r
 364         unsigned char bsc = *pattern;\r
 365         if (!bsc) {\r
 366                 // Avoid overrun\r
 367                 result = '\\';  // \ at end of pattern, take it literally\r
 368                 return result;\r
 369         }\r
 370 \r
 371         switch (bsc) {\r
 372         case 'a':\r
 373         case 'b':\r
 374         case 'n':\r
 375         case 'f':\r
 376         case 'r':\r
 377         case 't':\r
 378         case 'v':\r
 379                 result = escapeValue(bsc);\r
 380                 break;\r
 381         case 'x': {\r
 382                         unsigned char hd1 = *(pattern + 1);\r
 383                         unsigned char hd2 = *(pattern + 2);\r
 384                         int hexValue = GetHexaChar(hd1, hd2);\r
 385                         if (hexValue >= 0) {\r
 386                                 result = hexValue;\r
 387                                 incr = 2;       // Must skip the digits\r
 388                         } else {\r
 389                                 result = 'x';   // \x without 2 digits: see it as 'x'\r
 390                         }\r
 391                 }\r
 392                 break;\r
 393         case 'd':\r
 394                 for (c = '0'; c <= '9'; c++) {\r
 395                         ChSet(static_cast<unsigned char>(c));\r
 396                 }\r
 397                 break;\r
 398         case 'D':\r
 399                 for (c = 0; c < MAXCHR; c++) {\r
 400                         if (c < '0' || c > '9') {\r
 401                                 ChSet(static_cast<unsigned char>(c));\r
 402                         }\r
 403                 }\r
 404                 break;\r
 405         case 's':\r
 406                 ChSet(' ');\r
 407                 ChSet('\t');\r
 408                 ChSet('\n');\r
 409                 ChSet('\r');\r
 410                 ChSet('\f');\r
 411                 ChSet('\v');\r
 412                 break;\r
 413         case 'S':\r
 414                 for (c = 0; c < MAXCHR; c++) {\r
 415                         if (c != ' ' && !(c >= 0x09 && c <= 0x0D)) {\r
 416                                 ChSet(static_cast<unsigned char>(c));\r
 417                         }\r
 418                 }\r
 419         case 'w':\r
 420                 for (c = 0; c < MAXCHR; c++) {\r
 421                         if (iswordc(static_cast<unsigned char>(c))) {\r
 422                                 ChSet(static_cast<unsigned char>(c));\r
 423                         }\r
 424                 }\r
 425                 break;\r
 426         case 'W':\r
 427                 for (c = 0; c < MAXCHR; c++) {\r
 428                         if (!iswordc(static_cast<unsigned char>(c))) {\r
 429                                 ChSet(static_cast<unsigned char>(c));\r
 430                         }\r
 431                 }\r
 432                 break;\r
 433         default:\r
 434                 result = bsc;\r
 435         }\r
 436         return result;\r
 437 }\r
 438 \r
 439 const char *RESearch::Compile(const char *pattern, int length, bool caseSensitive, bool posix) {\r
 440         char *mp=nfa;          /* nfa pointer       */\r
 441         char *lp;              /* saved pointer     */\r
 442         char *sp=nfa;          /* another one       */\r
 443         char *mpMax = mp + MAXNFA - BITBLK - 10;\r
 444 \r
 445         int tagi = 0;          /* tag stack index   */\r
 446         int tagc = 1;          /* actual tag count  */\r
 447 \r
 448         int n;\r
 449         char mask;             /* xor mask -CCL/NCL */\r
 450         int c1, c2, prevChar;\r
 451 \r
 452         if (!pattern || !length)\r
 453                 if (sta)\r
 454                         return 0;\r
 455                 else\r
 456                         return badpat("No previous regular expression");\r
 457         sta = NOP;\r
 458 \r
 459         const char *p=pattern;     /* pattern pointer   */\r
 460         for (int i=0; i<length; i++, p++) {\r
 461                 if (mp > mpMax)\r
 462                         return badpat("Pattern too long");\r
 463                 lp = mp;\r
 464                 switch (*p) {\r
 465 \r
 466                 case '.':               /* match any char  */\r
 467                         *mp++ = ANY;\r
 468                         break;\r
 469 \r
 470                 case '^':               /* match beginning */\r
 471                         if (p == pattern)\r
 472                                 *mp++ = BOL;\r
 473                         else {\r
 474                                 *mp++ = CHR;\r
 475                                 *mp++ = *p;\r
 476                         }\r
 477                         break;\r
 478 \r
 479                 case '$':               /* match endofline */\r
 480                         if (!*(p+1))\r
 481                                 *mp++ = EOL;\r
 482                         else {\r
 483                                 *mp++ = CHR;\r
 484                                 *mp++ = *p;\r
 485                         }\r
 486                         break;\r
 487 \r
 488                 case '[':               /* match char class */\r
 489                         *mp++ = CCL;\r
 490                         prevChar = 0;\r
 491 \r
 492                         i++;\r
 493                         if (*++p == '^') {\r
 494                                 mask = '\377';\r
 495                                 i++;\r
 496                                 p++;\r
 497                         } else\r
 498                                 mask = 0;\r
 499 \r
 500                         if (*p == '-') {        /* real dash */\r
 501                                 i++;\r
 502                                 prevChar = *p;\r
 503                                 ChSet(*p++);\r
 504                         }\r
 505                         if (*p == ']') {        /* real brace */\r
 506                                 i++;\r
 507                                 prevChar = *p;\r
 508                                 ChSet(*p++);\r
 509                         }\r
 510                         while (*p && *p != ']') {\r
 511                                 if (*p == '-') {\r
 512                                         if (prevChar < 0) {\r
 513                                                 // Previous def. was a char class like \d, take dash literally\r
 514                                                 prevChar = *p;\r
 515                                                 ChSet(*p);\r
 516                                         } else if (*(p+1)) {\r
 517                                                 if (*(p+1) != ']') {\r
 518                                                         c1 = prevChar + 1;\r
 519                                                         i++;\r
 520                                                         c2 = *++p;\r
 521                                                         if (c2 == '\\') {\r
 522                                                                 if (!*(p+1))    // End of RE\r
 523                                                                         return badpat("Missing ]");\r
 524                                                                 else {\r
 525                                                                         i++;\r
 526                                                                         p++;\r
 527                                                                         int incr;\r
 528                                                                         c2 = GetBackslashExpression(p, incr);\r
 529                                                                         i += incr;\r
 530                                                                         p += incr;\r
 531                                                                         if (c2 >= 0) {\r
 532                                                                                 // Convention: \c (c is any char) is case sensitive, whatever the option\r
 533                                                                                 ChSet(static_cast<unsigned char>(c2));\r
 534                                                                                 prevChar = c2;\r
 535                                                                         } else {\r
 536                                                                                 // bittab is already changed\r
 537                                                                                 prevChar = -1;\r
 538                                                                         }\r
 539                                                                 }\r
 540                                                         }\r
 541                                                         if (prevChar < 0) {\r
 542                                                                 // Char after dash is char class like \d, take dash literally\r
 543                                                                 prevChar = '-';\r
 544                                                                 ChSet('-');\r
 545                                                         } else {\r
 546                                                                 // Put all chars between c1 and c2 included in the char set\r
 547                                                                 while (c1 <= c2) {\r
 548                                                                         ChSetWithCase(static_cast<unsigned char>(c1++), caseSensitive);\r
 549                                                                 }\r
 550                                                         }\r
 551                                                 } else {\r
 552                                                         // Dash before the ], take it literally\r
 553                                                         prevChar = *p;\r
 554                                                         ChSet(*p);\r
 555                                                 }\r
 556                                         } else {\r
 557                                                 return badpat("Missing ]");\r
 558                                         }\r
 559                                 } else if (*p == '\\' && *(p+1)) {\r
 560                                         i++;\r
 561                                         p++;\r
 562                                         int incr;\r
 563                                         int c = GetBackslashExpression(p, incr);\r
 564                                         i += incr;\r
 565                                         p += incr;\r
 566                                         if (c >= 0) {\r
 567                                                 // Convention: \c (c is any char) is case sensitive, whatever the option\r
 568                                                 ChSet(static_cast<unsigned char>(c));\r
 569                                                 prevChar = c;\r
 570                                         } else {\r
 571                                                 // bittab is already changed\r
 572                                                 prevChar = -1;\r
 573                                         }\r
 574                                 } else {\r
 575                                         prevChar = *p;\r
 576                                         ChSetWithCase(*p, caseSensitive);\r
 577                                 }\r
 578                                 i++;\r
 579                                 p++;\r
 580                         }\r
 581                         if (!*p)\r
 582                                 return badpat("Missing ]");\r
 583 \r
 584                         for (n = 0; n < BITBLK; bittab[n++] = 0)\r
 585                                 *mp++ = static_cast<char>(mask ^ bittab[n]);\r
 586 \r
 587                         break;\r
 588 \r
 589                 case '*':               /* match 0 or more... */\r
 590                 case '+':               /* match 1 or more... */\r
 591                         if (p == pattern)\r
 592                                 return badpat("Empty closure");\r
 593                         lp = sp;                /* previous opcode */\r
 594                         if (*lp == CLO)         /* equivalence... */\r
 595                                 break;\r
 596                         switch (*lp) {\r
 597 \r
 598                         case BOL:\r
 599                         case BOT:\r
 600                         case EOT:\r
 601                         case BOW:\r
 602                         case EOW:\r
 603                         case REF:\r
 604                                 return badpat("Illegal closure");\r
 605                         default:\r
 606                                 break;\r
 607                         }\r
 608 \r
 609                         if (*p == '+')\r
 610                                 for (sp = mp; lp < sp; lp++)\r
 611                                         *mp++ = *lp;\r
 612 \r
 613                         *mp++ = END;\r
 614                         *mp++ = END;\r
 615                         sp = mp;\r
 616                         while (--mp > lp)\r
 617                                 *mp = mp[-1];\r
 618                         *mp = CLO;\r
 619                         mp = sp;\r
 620                         break;\r
 621 \r
 622                 case '\\':              /* tags, backrefs... */\r
 623                         i++;\r
 624                         switch (*++p) {\r
 625                         case '<':\r
 626                                 *mp++ = BOW;\r
 627                                 break;\r
 628                         case '>':\r
 629                                 if (*sp == BOW)\r
 630                                         return badpat("Null pattern inside \\<\\>");\r
 631                                 *mp++ = EOW;\r
 632                                 break;\r
 633                         case '1':\r
 634                         case '2':\r
 635                         case '3':\r
 636                         case '4':\r
 637                         case '5':\r
 638                         case '6':\r
 639                         case '7':\r
 640                         case '8':\r
 641                         case '9':\r
 642                                 n = *p-'0';\r
 643                                 if (tagi > 0 && tagstk[tagi] == n)\r
 644                                         return badpat("Cyclical reference");\r
 645                                 if (tagc > n) {\r
 646                                         *mp++ = static_cast<char>(REF);\r
 647                                         *mp++ = static_cast<char>(n);\r
 648                                 } else\r
 649                                         return badpat("Undetermined reference");\r
 650                                 break;\r
 651                         default:\r
 652                                 if (!posix && *p == '(') {\r
 653                                         if (tagc < MAXTAG) {\r
 654                                                 tagstk[++tagi] = tagc;\r
 655                                                 *mp++ = BOT;\r
 656                                                 *mp++ = static_cast<char>(tagc++);\r
 657                                         } else\r
 658                                                 return badpat("Too many \\(\\) pairs");\r
 659                                 } else if (!posix && *p == ')') {\r
 660                                         if (*sp == BOT)\r
 661                                                 return badpat("Null pattern inside \\(\\)");\r
 662                                         if (tagi > 0) {\r
 663                                                 *mp++ = static_cast<char>(EOT);\r
 664                                                 *mp++ = static_cast<char>(tagstk[tagi--]);\r
 665                                         } else\r
 666                                                 return badpat("Unmatched \\)");\r
 667                                 } else {\r
 668                                         int incr;\r
 669                                         int c = GetBackslashExpression(p, incr);\r
 670                                         i += incr;\r
 671                                         p += incr;\r
 672                                         if (c >= 0) {\r
 673                                                 *mp++ = CHR;\r
 674                                                 *mp++ = static_cast<unsigned char>(c);\r
 675                                         } else {\r
 676                                                 *mp++ = CCL;\r
 677                                                 mask = 0;\r
 678                                                 for (n = 0; n < BITBLK; bittab[n++] = 0)\r
 679                                                         *mp++ = static_cast<char>(mask ^ bittab[n]);\r
 680                                         }\r
 681                                 }\r
 682                         }\r
 683                         break;\r
 684 \r
 685                 default :               /* an ordinary char */\r
 686                         if (posix && *p == '(') {\r
 687                                 if (tagc < MAXTAG) {\r
 688                                         tagstk[++tagi] = tagc;\r
 689                                         *mp++ = BOT;\r
 690                                         *mp++ = static_cast<char>(tagc++);\r
 691                                 } else\r
 692                                         return badpat("Too many () pairs");\r
 693                         } else if (posix && *p == ')') {\r
 694                                 if (*sp == BOT)\r
 695                                         return badpat("Null pattern inside ()");\r
 696                                 if (tagi > 0) {\r
 697                                         *mp++ = static_cast<char>(EOT);\r
 698                                         *mp++ = static_cast<char>(tagstk[tagi--]);\r
 699                                 } else\r
 700                                         return badpat("Unmatched )");\r
 701                         } else {\r
 702                                 unsigned char c = *p;\r
 703                                 if (!c) // End of RE\r
 704                                         c = '\\';       // We take it as raw backslash\r
 705                                 if (caseSensitive || !iswordc(c)) {\r
 706                                         *mp++ = CHR;\r
 707                                         *mp++ = c;\r
 708                                 } else {\r
 709                                         *mp++ = CCL;\r
 710                                         mask = 0;\r
 711                                         ChSetWithCase(c, false);\r
 712                                         for (n = 0; n < BITBLK; bittab[n++] = 0)\r
 713                                                 *mp++ = static_cast<char>(mask ^ bittab[n]);\r
 714                                 }\r
 715                         }\r
 716                         break;\r
 717                 }\r
 718                 sp = lp;\r
 719         }\r
 720         if (tagi > 0)\r
 721                 return badpat((posix ? "Unmatched (" : "Unmatched \\("));\r
 722         *mp = END;\r
 723         sta = OKP;\r
 724         return 0;\r
 725 }\r
 726 \r
 727 /*\r
 728  * RESearch::Execute:\r
 729  *   execute nfa to find a match.\r
 730  *\r
 731  *  special cases: (nfa[0])\r
 732  *      BOL\r
 733  *          Match only once, starting from the\r
 734  *          beginning.\r
 735  *      CHR\r
 736  *          First locate the character without\r
 737  *          calling PMatch, and if found, call\r
 738  *          PMatch for the remaining string.\r
 739  *      END\r
 740  *          RESearch::Compile failed, poor luser did not\r
 741  *          check for it. Fail fast.\r
 742  *\r
 743  *  If a match is found, bopat[0] and eopat[0] are set\r
 744  *  to the beginning and the end of the matched fragment,\r
 745  *  respectively.\r
 746  *\r
 747  */\r
 748 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {\r
 749         unsigned char c;\r
 750         int ep = NOTFOUND;\r
 751         char *ap = nfa;\r
 752 \r
 753         bol = lp;\r
 754         failure = 0;\r
 755 \r
 756         Clear();\r
 757 \r
 758         switch (*ap) {\r
 759 \r
 760         case BOL:                       /* anchored: match from BOL only */\r
 761                 ep = PMatch(ci, lp, endp, ap);\r
 762                 break;\r
 763         case EOL:                       /* just searching for end of line normal path doesn't work */\r
 764                 if (*(ap+1) == END) {\r
 765                         lp = endp;\r
 766                         ep = lp;\r
 767                         break;\r
 768                 } else {\r
 769                         return 0;\r
 770                 }\r
 771         case CHR:                       /* ordinary char: locate it fast */\r
 772                 c = *(ap+1);\r
 773                 while ((lp < endp) && (ci.CharAt(lp) != c))\r
 774                         lp++;\r
 775                 if (lp >= endp) /* if EOS, fail, else fall thru. */\r
 776                         return 0;\r
 777         default:                        /* regular matching all the way. */\r
 778                 while (lp < endp) {\r
 779                         ep = PMatch(ci, lp, endp, ap);\r
 780                         if (ep != NOTFOUND)\r
 781                                 break;\r
 782                         lp++;\r
 783                 }\r
 784                 break;\r
 785         case END:                       /* munged automaton. fail always */\r
 786                 return 0;\r
 787         }\r
 788         if (ep == NOTFOUND)\r
 789                 return 0;\r
 790 \r
 791         bopat[0] = lp;\r
 792         eopat[0] = ep;\r
 793         return 1;\r
 794 }\r
 795 \r
 796 /*\r
 797  * PMatch: internal routine for the hard part\r
 798  *\r
 799  *  This code is partly snarfed from an early grep written by\r
 800  *  David Conroy. The backref and tag stuff, and various other\r
 801  *  innovations are by oz.\r
 802  *\r
 803  *  special case optimizations: (nfa[n], nfa[n+1])\r
 804  *      CLO ANY\r
 805  *          We KNOW .* will match everything upto the\r
 806  *          end of line. Thus, directly go to the end of\r
 807  *          line, without recursive PMatch calls. As in\r
 808  *          the other closure cases, the remaining pattern\r
 809  *          must be matched by moving backwards on the\r
 810  *          string recursively, to find a match for xy\r
 811  *          (x is ".*" and y is the remaining pattern)\r
 812  *          where the match satisfies the LONGEST match for\r
 813  *          x followed by a match for y.\r
 814  *      CLO CHR\r
 815  *          We can again scan the string forward for the\r
 816  *          single char and at the point of failure, we\r
 817  *          execute the remaining nfa recursively, same as\r
 818  *          above.\r
 819  *\r
 820  *  At the end of a successful match, bopat[n] and eopat[n]\r
 821  *  are set to the beginning and end of subpatterns matched\r
 822  *  by tagged expressions (n = 1 to 9).\r
 823  */\r
 824 \r
 825 extern void re_fail(char *,char);\r
 826 \r
 827 #define isinset(x,y)    ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])\r
 828 \r
 829 /*\r
 830  * skip values for CLO XXX to skip past the closure\r
 831  */\r
 832 \r
 833 #define ANYSKIP 2       /* [CLO] ANY END          */\r
 834 #define CHRSKIP 3       /* [CLO] CHR chr END      */\r
 835 #define CCLSKIP 34      /* [CLO] CCL 32 bytes END */\r
 836 \r
 837 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {\r
 838         int op, c, n;\r
 839         int e;          /* extra pointer for CLO  */\r
 840         int bp;         /* beginning of subpat... */\r
 841         int ep;         /* ending of subpat...    */\r
 842         int are;        /* to save the line ptr.  */\r
 843 \r
 844         while ((op = *ap++) != END)\r
 845                 switch (op) {\r
 846 \r
 847                 case CHR:\r
 848                         if (ci.CharAt(lp++) != *ap++)\r
 849                                 return NOTFOUND;\r
 850                         break;\r
 851                 case ANY:\r
 852                         if (lp++ >= endp)\r
 853                                 return NOTFOUND;\r
 854                         break;\r
 855                 case CCL:\r
 856                         if (lp >= endp)\r
 857                                 return NOTFOUND;\r
 858                         c = ci.CharAt(lp++);\r
 859                         if (!isinset(ap,c))\r
 860                                 return NOTFOUND;\r
 861                         ap += BITBLK;\r
 862                         break;\r
 863                 case BOL:\r
 864                         if (lp != bol)\r
 865                                 return NOTFOUND;\r
 866                         break;\r
 867                 case EOL:\r
 868                         if (lp < endp)\r
 869                                 return NOTFOUND;\r
 870                         break;\r
 871                 case BOT:\r
 872                         bopat[*ap++] = lp;\r
 873                         break;\r
 874                 case EOT:\r
 875                         eopat[*ap++] = lp;\r
 876                         break;\r
 877                 case BOW:\r
 878                         if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))\r
 879                                 return NOTFOUND;\r
 880                         break;\r
 881                 case EOW:\r
 882                         if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))\r
 883                                 return NOTFOUND;\r
 884                         break;\r
 885                 case REF:\r
 886                         n = *ap++;\r
 887                         bp = bopat[n];\r
 888                         ep = eopat[n];\r
 889                         while (bp < ep)\r
 890                                 if (ci.CharAt(bp++) != ci.CharAt(lp++))\r
 891                                         return NOTFOUND;\r
 892                         break;\r
 893                 case CLO:\r
 894                         are = lp;\r
 895                         switch (*ap) {\r
 896 \r
 897                         case ANY:\r
 898                                 while (lp < endp)\r
 899                                         lp++;\r
 900                                 n = ANYSKIP;\r
 901                                 break;\r
 902                         case CHR:\r
 903                                 c = *(ap+1);\r
 904                                 while ((lp < endp) && (c == ci.CharAt(lp)))\r
 905                                         lp++;\r
 906                                 n = CHRSKIP;\r
 907                                 break;\r
 908                         case CCL:\r
 909                                 while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))\r
 910                                         lp++;\r
 911                                 n = CCLSKIP;\r
 912                                 break;\r
 913                         default:\r
 914                                 failure = true;\r
 915                                 //re_fail("closure: bad nfa.", *ap);\r
 916                                 return NOTFOUND;\r
 917                         }\r
 918 \r
 919                         ap += n;\r
 920 \r
 921                         while (lp >= are) {\r
 922                                 if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)\r
 923                                         return e;\r
 924                                 --lp;\r
 925                         }\r
 926                         return NOTFOUND;\r
 927                 default:\r
 928                         //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));\r
 929                         return NOTFOUND;\r
 930                 }\r
 931         return lp;\r
 932 }\r
 933 \r
 934 /*\r
 935  * RESearch::Substitute:\r
 936  *  substitute the matched portions of the src in dst.\r
 937  *\r
 938  *  &    substitute the entire matched pattern.\r
 939  *\r
 940  *  \digit  substitute a subpattern, with the given tag number.\r
 941  *      Tags are numbered from 1 to 9. If the particular\r
 942  *      tagged subpattern does not exist, null is substituted.\r
 943  */\r
 944 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {\r
 945         unsigned char c;\r
 946         int  pin;\r
 947         int bp;\r
 948         int ep;\r
 949 \r
 950         if (!*src || !bopat[0])\r
 951                 return 0;\r
 952 \r
 953         while ((c = *src++) != 0) {\r
 954                 switch (c) {\r
 955 \r
 956                 case '&':\r
 957                         pin = 0;\r
 958                         break;\r
 959 \r
 960                 case '\\':\r
 961                         c = *src++;\r
 962                         if (c >= '0' && c <= '9') {\r
 963                                 pin = c - '0';\r
 964                                 break;\r
 965                         }\r
 966 \r
 967                 default:\r
 968                         *dst++ = c;\r
 969                         continue;\r
 970                 }\r
 971 \r
 972                 if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {\r
 973                         while (ci.CharAt(bp) && bp < ep)\r
 974                                 *dst++ = ci.CharAt(bp++);\r
 975                         if (bp < ep)\r
 976                                 return 0;\r
 977                 }\r
 978         }\r
 979         *dst = '\0';\r
 980         return 1;\r
 981 }\r
 982 \r