2 * TclRegComp and TclRegExec -- TclRegSub is elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions. Serious changes in
23 * regular-expression syntax might require a total rethink.
25 * *** NOTE: this code has been altered slightly for use in Tcl: ***
26 * *** 1. Use ckalloc and ckfree instead of malloc and free. ***
27 * *** 2. Add extra argument to regexp to specify the real ***
28 * *** start of the string separately from the start of the ***
29 * *** current search. This is needed to search for multiple ***
30 * *** matches within a string. ***
31 * *** 3. Names have been changed, e.g. from regcomp to ***
32 * *** TclRegComp, to avoid clashes with other ***
33 * *** regexp implementations used by applications. ***
34 * *** 4. Added errMsg declaration and TclRegError procedure ***
35 * *** 5. Various lint-like things, such as casting arguments ***
36 * *** in procedure calls. ***
38 * *** NOTE: This code has been altered for use in MT-Sturdy Tcl ***
39 * *** 1. All use of static variables has been changed to access ***
40 * *** fields of a structure. ***
41 * *** 2. This in addition to changes to TclRegError makes the ***
42 * *** code multi-thread safe. ***
51 * The variable below is set to NULL before invoking regexp functions
52 * and checked after those functions. If an error occurred then TclRegError
53 * will set the variable to point to a (static) error message. This
54 * mechanism unfortunately does not support multi-threading, but the
55 * procedures TclRegError and TclGetRegError can be modified to use
56 * thread-specific storage for the variable and thereby make the code
60 static char *errMsg = NULL;
63 * The "internal use only" fields in regexp.h are present to pass info from
64 * compile to execute that permits the execute phase to run lots faster on
65 * simple cases. They are:
67 * regstart char that must begin a match; '\0' if none obvious
68 * reganch is the match anchored (at beginning-of-line only)?
69 * regmust string (pointer into program) that match must include, or NULL
70 * regmlen length of regmust string
72 * Regstart and reganch permit very fast decisions on suitable starting points
73 * for a match, cutting down the work a lot. Regmust permits fast rejection
74 * of lines that cannot possibly match. The regmust tests are costly enough
75 * that TclRegComp() supplies a regmust only if the r.e. contains something
76 * potentially expensive (at present, the only such thing detected is * or +
77 * at the start of the r.e., which can involve a lot of backup). Regmlen is
78 * supplied because the test in TclRegExec() needs it and TclRegComp() is
79 * computing it anyway.
83 * Structure for regexp "program". This is essentially a linear encoding
84 * of a nondeterministic finite-state machine (aka syntax charts or
85 * "railroad normal form" in parsing technology). Each node is an opcode
86 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
87 * all nodes except BRANCH implement concatenation; a "next" pointer with
88 * a BRANCH on both ends of it is connecting two alternatives. (Here we
89 * have one of the subtle syntax dependencies: an individual BRANCH (as
90 * opposed to a collection of them) is never concatenated with anything
91 * because of operator precedence.) The operand of some types of node is
92 * a literal string; for others, it is a node leading into a sub-FSM. In
93 * particular, the operand of a BRANCH node is the first node of the branch.
94 * (NB this is *not* a tree structure: the tail of the branch connects
95 * to the thing following the set of BRANCHes.) The opcodes are:
98 /* definition number opnd? meaning */
99 #define END 0 /* no End of program. */
100 #define BOL 1 /* no Match "" at beginning of line. */
101 #define EOL 2 /* no Match "" at end of line. */
102 #define ANY 3 /* no Match any one character. */
103 #define ANYOF 4 /* str Match any character in this string. */
104 #define ANYBUT 5 /* str Match any character not in this string. */
105 #define BRANCH 6 /* node Match this alternative, or the next... */
106 #define BACK 7 /* no Match "", "next" ptr points backward. */
107 #define EXACTLY 8 /* str Match this string. */
108 #define NOTHING 9 /* no Match empty string. */
109 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
110 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
111 #define OPEN 20 /* no Mark this point in input as start of #n. */
112 /* OPEN+1 is number 1, etc. */
113 #define CLOSE (OPEN+NSUBEXP) /* no Analogous to OPEN. */
118 * BRANCH The set of branches constituting a single choice are hooked
119 * together with their "next" pointers, since precedence prevents
120 * anything being concatenated to any individual branch. The
121 * "next" pointer of the last BRANCH in a choice points to the
122 * thing following the whole choice. This is also where the
123 * final "next" pointer of each individual branch points; each
124 * branch starts with the operand node of a BRANCH node.
126 * BACK Normal "next" pointers all implicitly point forward; BACK
127 * exists to make loop structures possible.
129 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
130 * BRANCH structures using BACK. Simple cases (one character
131 * per match) are implemented with STAR and PLUS for speed
132 * and to minimize recursive plunges.
134 * OPEN,CLOSE ...are numbered at compile time.
138 * A node is one char of opcode followed by two chars of "next" pointer.
139 * "Next" pointers are stored as two 8-bit pieces, high order first. The
140 * value is a positive offset from the opcode of the node containing it.
141 * An operand, if any, simply follows the node. (Note that much of the
142 * code generation knows about this implicit relationship.)
144 * Using two bytes for the "next" pointer is vast overkill for most things,
145 * but allows patterns to get big without disasters.
148 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
149 #define OPERAND(p) ((p) + 3)
152 * See regmagic.h for one further detail of program structure.
157 * Utility definitions.
160 #define UCHARAT(p) ((int)*(unsigned char *)(p))
162 #define UCHARAT(p) ((int)*(p)&CHARBITS)
165 #define FAIL(m) { TclRegError(m); return(NULL); }
166 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define META "^$.[()|?+*\\"
170 * Flags to be passed up and down.
172 #define HASWIDTH 01 /* Known never to match null string. */
173 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
174 #define SPSTART 04 /* Starts with * or +. */
175 #define WORST 0 /* Worst case. */
178 * Global work variables for TclRegComp().
180 struct regcomp_state {
181 char *regparse; /* Input-scan pointer. */
182 int regnpar; /* () count. */
183 char *regcode; /* Code-emit pointer; ®dummy = don't. */
184 long regsize; /* Code size. */
187 static char regdummy;
190 * The first byte of the regexp internal "program" is actually this magic
191 * number; the start node begins in the second byte.
197 * Forward declarations for TclRegComp()'s friends.
200 static char * reg _ANSI_ARGS_((int paren, int *flagp,
201 struct regcomp_state *rcstate));
202 static char * regatom _ANSI_ARGS_((int *flagp,
203 struct regcomp_state *rcstate));
204 static char * regbranch _ANSI_ARGS_((int *flagp,
205 struct regcomp_state *rcstate));
206 static void regc _ANSI_ARGS_((int b,
207 struct regcomp_state *rcstate));
208 static void reginsert _ANSI_ARGS_((int op, char *opnd,
209 struct regcomp_state *rcstate));
210 static char * regnext _ANSI_ARGS_((char *p));
211 static char * regnode _ANSI_ARGS_((int op,
212 struct regcomp_state *rcstate));
213 static void regoptail _ANSI_ARGS_((char *p, char *val));
214 static char * regpiece _ANSI_ARGS_((int *flagp,
215 struct regcomp_state *rcstate));
216 static void regtail _ANSI_ARGS_((char *p, char *val));
219 static int strcspn _ANSI_ARGS_((char *s1, char *s2));
223 - TclRegComp - compile a regular expression into internal code
225 * We can't allocate space until we know how big the compiled form will be,
226 * but we can't compile it (and thus know how big it is) until we've got a
227 * place to put the code. So we cheat: we compile it twice, once with code
228 * generation turned off and size counting turned on, and once "for real".
229 * This also means that we don't allocate space until we are sure that the
230 * thing really will compile successfully, and we never have to move the
231 * code and thus invalidate pointers into it. (Note that it has to be in
232 * one piece because free() must be able to free it all.)
234 * Beware that the optimization-preparation code in here knows about some
235 * of the structure of the compiled regexp.
243 register char *longest;
246 struct regcomp_state state;
247 struct regcomp_state *rcstate= &state;
250 FAIL("NULL argument");
252 /* First pass: determine size, legality. */
253 rcstate->regparse = exp;
254 rcstate->regnpar = 1;
255 rcstate->regsize = 0L;
256 rcstate->regcode = ®dummy;
257 regc(MAGIC, rcstate);
258 if (reg(0, &flags, rcstate) == NULL)
261 /* Small enough for pointer-storage convention? */
262 if (rcstate->regsize >= 32767L) /* Probably could be 65535L. */
263 FAIL("regexp too big");
265 /* Allocate space. */
266 r = (regexp *)ckalloc(sizeof(regexp) + (unsigned)rcstate->regsize);
268 FAIL("out of space");
270 /* Second pass: emit code. */
271 rcstate->regparse = exp;
272 rcstate->regnpar = 1;
273 rcstate->regcode = r->program;
274 regc(MAGIC, rcstate);
275 if (reg(0, &flags, rcstate) == NULL)
278 /* Dig out information for optimizations. */
279 r->regstart = '\0'; /* Worst-case defaults. */
283 scan = r->program+1; /* First BRANCH. */
284 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
285 scan = OPERAND(scan);
287 /* Starting-point info. */
288 if (OP(scan) == EXACTLY)
289 r->regstart = *OPERAND(scan);
290 else if (OP(scan) == BOL)
294 * If there's something expensive in the r.e., find the
295 * longest literal string that must appear and make it the
296 * regmust. Resolve ties in favor of later strings, since
297 * the regstart check works with the beginning of the r.e.
298 * and avoiding duplication strengthens checking. Not a
299 * strong reason, but sufficient in the absence of others.
304 for (; scan != NULL; scan = regnext(scan))
305 if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
306 longest = OPERAND(scan);
307 len = strlen(OPERAND(scan));
309 r->regmust = longest;
318 - reg - regular expression, i.e. main body or parenthesized thing
320 * Caller must absorb opening parenthesis.
322 * Combining parenthesis handling with the base level of regular expression
323 * is a trifle forced, but the need to tie the tails of the branches to what
324 * follows makes it hard to avoid.
327 reg(paren, flagp, rcstate)
328 int paren; /* Parenthesized? */
330 struct regcomp_state *rcstate;
334 register char *ender;
335 register int parno = 0;
338 *flagp = HASWIDTH; /* Tentatively. */
340 /* Make an OPEN node, if parenthesized. */
342 if (rcstate->regnpar >= NSUBEXP)
344 parno = rcstate->regnpar;
346 ret = regnode(OPEN+parno,rcstate);
350 /* Pick up the branches, linking them together. */
351 br = regbranch(&flags,rcstate);
355 regtail(ret, br); /* OPEN -> first. */
358 if (!(flags&HASWIDTH))
360 *flagp |= flags&SPSTART;
361 while (*rcstate->regparse == '|') {
363 br = regbranch(&flags,rcstate);
366 regtail(ret, br); /* BRANCH -> BRANCH. */
367 if (!(flags&HASWIDTH))
369 *flagp |= flags&SPSTART;
372 /* Make a closing node, and hook it on the end. */
373 ender = regnode((paren) ? CLOSE+parno : END,rcstate);
376 /* Hook the tails of the branches to the closing node. */
377 for (br = ret; br != NULL; br = regnext(br))
378 regoptail(br, ender);
380 /* Check for proper termination. */
381 if (paren && *rcstate->regparse++ != ')') {
382 FAIL("unmatched ()");
383 } else if (!paren && *rcstate->regparse != '\0') {
384 if (*rcstate->regparse == ')') {
385 FAIL("unmatched ()");
387 FAIL("junk on end"); /* "Can't happen". */
395 - regbranch - one alternative of an | operator
397 * Implements the concatenation operator.
400 regbranch(flagp, rcstate)
402 struct regcomp_state *rcstate;
405 register char *chain;
406 register char *latest;
409 *flagp = WORST; /* Tentatively. */
411 ret = regnode(BRANCH,rcstate);
413 while (*rcstate->regparse != '\0' && *rcstate->regparse != '|' &&
414 *rcstate->regparse != ')') {
415 latest = regpiece(&flags, rcstate);
418 *flagp |= flags&HASWIDTH;
419 if (chain == NULL) /* First piece. */
420 *flagp |= flags&SPSTART;
422 regtail(chain, latest);
425 if (chain == NULL) /* Loop ran zero times. */
426 (void) regnode(NOTHING,rcstate);
432 - regpiece - something followed by possible [*+?]
434 * Note that the branching code sequences used for ? and the general cases
435 * of * and + are somewhat optimized: they use the same NOTHING node as
436 * both the endmarker for their branch list and the body of the last branch.
437 * It might seem that this node could be dispensed with entirely, but the
438 * endmarker role is not redundant.
441 regpiece(flagp, rcstate)
443 struct regcomp_state *rcstate;
450 ret = regatom(&flags,rcstate);
454 op = *rcstate->regparse;
460 if (!(flags&HASWIDTH) && op != '?')
461 FAIL("*+ operand could be empty");
462 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
464 if (op == '*' && (flags&SIMPLE))
465 reginsert(STAR, ret, rcstate);
466 else if (op == '*') {
467 /* Emit x* as (x&|), where & means "self". */
468 reginsert(BRANCH, ret, rcstate); /* Either x */
469 regoptail(ret, regnode(BACK,rcstate)); /* and loop */
470 regoptail(ret, ret); /* back */
471 regtail(ret, regnode(BRANCH,rcstate)); /* or */
472 regtail(ret, regnode(NOTHING,rcstate)); /* null. */
473 } else if (op == '+' && (flags&SIMPLE))
474 reginsert(PLUS, ret, rcstate);
475 else if (op == '+') {
476 /* Emit x+ as x(&|), where & means "self". */
477 next = regnode(BRANCH,rcstate); /* Either */
479 regtail(regnode(BACK,rcstate), ret); /* loop back */
480 regtail(next, regnode(BRANCH,rcstate)); /* or */
481 regtail(ret, regnode(NOTHING,rcstate)); /* null. */
482 } else if (op == '?') {
483 /* Emit x? as (x|) */
484 reginsert(BRANCH, ret, rcstate); /* Either x */
485 regtail(ret, regnode(BRANCH,rcstate)); /* or */
486 next = regnode(NOTHING,rcstate); /* null. */
488 regoptail(ret, next);
491 if (ISMULT(*rcstate->regparse))
498 - regatom - the lowest level
500 * Optimization: gobbles an entire sequence of ordinary characters so that
501 * it can turn them into a single node, which is smaller to store and
502 * faster to run. Backslashed characters are exceptions, each becoming a
503 * separate node; the code is simpler that way and it's not worth fixing.
506 regatom(flagp, rcstate)
508 struct regcomp_state *rcstate;
513 *flagp = WORST; /* Tentatively. */
515 switch (*rcstate->regparse++) {
517 ret = regnode(BOL,rcstate);
520 ret = regnode(EOL,rcstate);
523 ret = regnode(ANY,rcstate);
524 *flagp |= HASWIDTH|SIMPLE;
528 register int classend;
530 if(rcstate->regparse[0] != '\\' &&
531 rcstate->regparse[0] != '^' &&
532 rcstate->regparse[1] == ']') {
533 ret = regnode(EXACTLY,rcstate);
534 regc(*rcstate->regparse++,rcstate);
537 *flagp |= HASWIDTH|SIMPLE;
540 if (*rcstate->regparse == '^') { /* Complement of range. */
541 ret = regnode(ANYBUT,rcstate);
544 ret = regnode(ANYOF,rcstate);
545 if (*rcstate->regparse == ']' || *rcstate->regparse == '-')
546 regc(*rcstate->regparse++,rcstate);
547 while (*rcstate->regparse != '\0' && *rcstate->regparse != ']') {
548 if (*rcstate->regparse == '-') {
550 if (*rcstate->regparse == ']' || *rcstate->regparse == '\0')
553 clss = UCHARAT(rcstate->regparse-2)+1;
554 classend = UCHARAT(rcstate->regparse);
555 if (clss > classend+1)
556 FAIL("invalid [] range");
557 for (; clss <= classend; clss++)
558 regc((char)clss,rcstate);
562 regc(*rcstate->regparse++,rcstate);
565 if (*rcstate->regparse != ']')
566 FAIL("unmatched []");
568 *flagp |= HASWIDTH|SIMPLE;
572 ret = reg(1, &flags, rcstate);
575 *flagp |= flags&(HASWIDTH|SPSTART);
580 FAIL("internal urp"); /* Supposed to be caught earlier. */
585 FAIL("?+* follows nothing");
588 if (*rcstate->regparse == '\0')
590 ret = regnode(EXACTLY,rcstate);
591 regc(*rcstate->regparse++,rcstate);
593 *flagp |= HASWIDTH|SIMPLE;
600 len = strcspn(rcstate->regparse, META);
602 FAIL("internal disaster");
603 ender = *(rcstate->regparse+len);
604 if (len > 1 && ISMULT(ender))
605 len--; /* Back off clear of ?+* operand. */
609 ret = regnode(EXACTLY,rcstate);
611 regc(*rcstate->regparse++,rcstate);
623 - regnode - emit a node
625 static char * /* Location. */
628 struct regcomp_state *rcstate;
633 ret = rcstate->regcode;
634 if (ret == ®dummy) {
635 rcstate->regsize += 3;
641 *ptr++ = '\0'; /* Null "next" pointer. */
643 rcstate->regcode = ptr;
649 - regc - emit (if appropriate) a byte of code
654 struct regcomp_state *rcstate;
656 if (rcstate->regcode != ®dummy)
657 *rcstate->regcode++ = (char)b;
663 - reginsert - insert an operator in front of already-emitted operand
665 * Means relocating the operand.
668 reginsert(op, opnd, rcstate)
671 struct regcomp_state *rcstate;
675 register char *place;
677 if (rcstate->regcode == ®dummy) {
678 rcstate->regsize += 3;
682 src = rcstate->regcode;
683 rcstate->regcode += 3;
684 dst = rcstate->regcode;
688 place = opnd; /* Op node, where operand used to be. */
695 - regtail - set the next-pointer at the end of a node chain
709 /* Find last node. */
712 temp = regnext(scan);
718 if (OP(scan) == BACK)
722 *(scan+1) = (char)((offset>>8)&0377);
723 *(scan+2) = (char)(offset&0377);
727 - regoptail - regtail on operand of first argument; nop if operandless
734 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
735 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
737 regtail(OPERAND(p), val);
741 * TclRegExec and friends
745 * Global work variables for TclRegExec().
747 struct regexec_state {
748 char *reginput; /* String-input pointer. */
749 char *regbol; /* Beginning of input, for ^ check. */
750 char **regstartp; /* Pointer to startp array. */
751 char **regendp; /* Ditto for endp. */
757 static int regtry _ANSI_ARGS_((regexp *prog, char *string,
758 struct regexec_state *restate));
759 static int regmatch _ANSI_ARGS_((char *prog,
760 struct regexec_state *restate));
761 static int regrepeat _ANSI_ARGS_((char *p,
762 struct regexec_state *restate));
766 void regdump _ANSI_ARGS_((regexp *r));
767 static char *regprop _ANSI_ARGS_((char *op));
771 - TclRegExec - match a regexp against a string
774 TclRegExec(prog, string, start)
775 register regexp *prog;
776 register char *string;
780 struct regexec_state state;
781 struct regexec_state *restate= &state;
784 if (prog == NULL || string == NULL) {
785 TclRegError("NULL parameter");
789 /* Check validity of program. */
790 if (UCHARAT(prog->program) != MAGIC) {
791 TclRegError("corrupted program");
795 /* If there is a "must appear" string, look for it. */
796 if (prog->regmust != NULL) {
798 while ((s = strchr(s, prog->regmust[0])) != NULL) {
799 if (strncmp(s, prog->regmust, (size_t) prog->regmlen)
801 break; /* Found it. */
804 if (s == NULL) /* Not present. */
808 /* Mark beginning of line for ^ . */
809 restate->regbol = start;
811 /* Simplest case: anchored match need be tried only once. */
813 return(regtry(prog, string, restate));
815 /* Messy cases: unanchored match. */
817 if (prog->regstart != '\0')
818 /* We know what char it must start with. */
819 while ((s = strchr(s, prog->regstart)) != NULL) {
820 if (regtry(prog, s, restate))
825 /* We don't -- general case. */
827 if (regtry(prog, s, restate))
829 } while (*s++ != '\0');
836 - regtry - try match at specific point
838 static int /* 0 failure, 1 success */
839 regtry(prog, string, restate)
842 struct regexec_state *restate;
848 restate->reginput = string;
849 restate->regstartp = prog->startp;
850 restate->regendp = prog->endp;
854 for (i = NSUBEXP; i > 0; i--) {
858 if (regmatch(prog->program + 1,restate)) {
859 prog->startp[0] = string;
860 prog->endp[0] = restate->reginput;
867 - regmatch - main matching routine
869 * Conceptually the strategy is simple: check to see whether the current
870 * node matches, call self recursively to see whether the rest matches,
871 * and then act accordingly. In practice we make some effort to avoid
872 * recursion, in particular by going through "ordinary" nodes (that don't
873 * need to know whether the rest of the match failed) by a loop instead of
876 static int /* 0 failure, 1 success */
877 regmatch(prog, restate)
879 struct regexec_state *restate;
881 register char *scan; /* Current node. */
882 char *next; /* Next node. */
886 if (scan != NULL && regnarrate)
887 fprintf(stderr, "%s(\n", regprop(scan));
889 while (scan != NULL) {
892 fprintf(stderr, "%s...\n", regprop(scan));
894 next = regnext(scan);
898 if (restate->reginput != restate->regbol) {
903 if (*restate->reginput != '\0') {
908 if (*restate->reginput == '\0') {
917 opnd = OPERAND(scan);
918 /* Inline the first character, for speed. */
919 if (*opnd != *restate->reginput) {
923 if (len > 1 && strncmp(opnd, restate->reginput, (size_t) len)
927 restate->reginput += len;
931 if (*restate->reginput == '\0'
932 || strchr(OPERAND(scan), *restate->reginput) == NULL) {
938 if (*restate->reginput == '\0'
939 || strchr(OPERAND(scan), *restate->reginput) != NULL) {
961 no = OP(scan) - OPEN;
962 save = restate->reginput;
964 if (regmatch(next,restate)) {
966 * Don't set startp if some later invocation of the
967 * same parentheses already has.
969 if (restate->regstartp[no] == NULL) {
970 restate->regstartp[no] = save;
990 no = OP(scan) - CLOSE;
991 save = restate->reginput;
993 if (regmatch(next,restate)) {
995 * Don't set endp if some later
996 * invocation of the same parentheses
999 if (restate->regendp[no] == NULL)
1000 restate->regendp[no] = save;
1007 register char *save;
1009 if (OP(next) != BRANCH) { /* No choice. */
1010 next = OPERAND(scan); /* Avoid recursion. */
1013 save = restate->reginput;
1014 if (regmatch(OPERAND(scan),restate))
1016 restate->reginput = save;
1017 scan = regnext(scan);
1018 } while (scan != NULL && OP(scan) == BRANCH);
1025 register char nextch;
1027 register char *save;
1031 * Lookahead to avoid useless match attempts
1032 * when we know what character comes next.
1035 if (OP(next) == EXACTLY)
1036 nextch = *OPERAND(next);
1037 min = (OP(scan) == STAR) ? 0 : 1;
1038 save = restate->reginput;
1039 no = regrepeat(OPERAND(scan),restate);
1041 /* If it could work, try it. */
1042 if (nextch == '\0' || *restate->reginput == nextch)
1043 if (regmatch(next,restate))
1045 if (nextch != '\0' && no > (min + 1)) {
1046 char tmp = *(save + no);
1049 p = strrchr(save, nextch);
1057 /* Couldn't or didn't -- back up. */
1059 restate->reginput = save + no;
1064 return(1); /* Success! */
1066 if (OP(scan) > OPEN && OP(scan) < OPEN+NSUBEXP) {
1068 } else if (OP(scan) > CLOSE && OP(scan) < CLOSE+NSUBEXP) {
1071 TclRegError("memory corruption");
1079 * We get here only if there's trouble -- normally "case END" is
1080 * the terminating point.
1082 TclRegError("corrupted pointers");
1087 - regrepeat - repeatedly match something simple, report how many
1090 regrepeat(p, restate)
1092 struct regexec_state *restate;
1094 register int count = 0;
1095 register char *scan;
1096 register char *opnd;
1098 scan = restate->reginput;
1102 count = strlen(scan);
1106 while (*opnd == *scan) {
1112 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1118 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1123 default: /* Oh dear. Called inappropriately. */
1124 TclRegError("internal foulup");
1125 count = 0; /* Best compromise. */
1128 restate->reginput = scan;
1134 - regnext - dig the "next" pointer out of a node
1140 register int offset;
1157 static char *regprop();
1160 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1167 register char op = EXACTLY; /* Arbitrary non-END op. */
1168 register char *next;
1172 while (op != END) { /* While that wasn't END last time... */
1174 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1176 if (next == NULL) /* Next ptr. */
1179 printf("(%d)", (s-r->program)+(next-s));
1181 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1182 /* Literal string, where present. */
1183 while (*s != '\0') {
1192 /* Header fields of interest. */
1193 if (r->regstart != '\0')
1194 printf("start `%c' ", r->regstart);
1196 printf("anchored ");
1197 if (r->regmust != NULL)
1198 printf("must have \"%s\"", r->regmust);
1203 - regprop - printable representation of opcode
1210 static char buf[50];
1212 (void) strcpy(buf, ":");
1254 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1266 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1276 if (OP(op) > OPEN && OP(op) < OPEN+NSUBEXP) {
1277 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1280 } else if (OP(op) > CLOSE && OP(op) < CLOSE+NSUBEXP) {
1281 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1284 TclRegError("corrupted opcode");
1289 (void) strcat(buf, p);
1295 * The following is provided for those people who do not have strcspn() in
1296 * their C libraries. They should get off their butts and do something
1297 * about it; at least one public-domain implementation of those (highly
1298 * useful) string routines has been published on Usenet.
1302 * strcspn - find length of initial segment of s1 consisting entirely
1303 * of characters not from s2
1311 register char *scan1;
1312 register char *scan2;
1316 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1317 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1318 if (*scan1 == *scan2++)
1327 *----------------------------------------------------------------------
1331 * This procedure is invoked by the regexp code when an error
1332 * occurs. It saves the error message so it can be seen by the
1333 * code that called Spencer's code.
1339 * The value of "string" is saved in "errMsg".
1341 *----------------------------------------------------------------------
1346 char *string; /* Error message. */