1 ------------------------------------------------------------------------------
3 -- GNAT LIBRARY COMPONENTS --
5 -- G N A T . R E G P A T --
9 -- Copyright (C) 1986 by University of Toronto. --
10 -- Copyright (C) 1996-2003 Ada Core Technologies, Inc. --
12 -- GNAT is free software; you can redistribute it and/or modify it under --
13 -- terms of the GNU General Public License as published by the Free Soft- --
14 -- ware Foundation; either version 2, or (at your option) any later ver- --
15 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
16 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
17 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
18 -- for more details. You should have received a copy of the GNU General --
19 -- Public License distributed with GNAT; see file COPYING. If not, write --
20 -- to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, --
21 -- MA 02111-1307, USA. --
23 -- As a special exception, if other files instantiate generics from this --
24 -- unit, or you link this unit with other files to produce an executable, --
25 -- this unit does not by itself cause the resulting executable to be --
26 -- covered by the GNU General Public License. This exception does not --
27 -- however invalidate any other reasons why the executable file might be --
28 -- covered by the GNU Public License. --
30 -- GNAT was originally developed by the GNAT team at New York University. --
31 -- Extensive contributions were provided by Ada Core Technologies Inc. --
33 ------------------------------------------------------------------------------
35 -- This is an altered Ada 95 version of the original V8 style regular
36 -- expression library written in C by Henry Spencer. Apart from the
37 -- translation to Ada, the interface has been considerably changed to
38 -- use the Ada String type instead of C-style nul-terminated strings.
40 -- Beware that some of this code is subtly aware of the way operator
41 -- precedence is structured in regular expressions. Serious changes in
42 -- regular-expression syntax might require a total rethink.
44 with System.IO; use System.IO;
45 with Ada.Characters.Handling; use Ada.Characters.Handling;
46 with Unchecked_Conversion;
48 package body GNAT.Regpat is
50 MAGIC : constant Character := Character'Val (10#0234#);
51 -- The first byte of the regexp internal "program" is actually
52 -- this magic number; the start node begins in the second byte.
54 -- This is used to make sure that a regular expression was correctly
57 ----------------------------
58 -- Implementation details --
59 ----------------------------
61 -- This is essentially a linear encoding of a nondeterministic
62 -- finite-state machine, also known as syntax charts or
63 -- "railroad normal form" in parsing technology.
65 -- Each node is an opcode plus a "next" pointer, possibly plus an
66 -- operand. "Next" pointers of all nodes except BRANCH implement
67 -- concatenation; a "next" pointer with a BRANCH on both ends of it
68 -- is connecting two alternatives.
70 -- The operand of some types of node is a literal string; for others,
71 -- it is a node leading into a sub-FSM. In particular, the operand of
72 -- a BRANCH node is the first node of the branch.
73 -- (NB this is *not* a tree structure: the tail of the branch connects
74 -- to the thing following the set of BRANCHes).
76 -- You can see the exact byte-compiled version by using the Dump
77 -- subprogram. However, here are a few examples:
80 -- 2 : BRANCH (next at 10)
81 -- 5 : EXACT (next at 18) operand=a
82 -- 10 : BRANCH (next at 18)
83 -- 13 : EXACT (next at 18) operand=b
84 -- 18 : EOP (next at 0)
87 -- 2 : CURLYX (next at 26) { 0, 32767}
88 -- 9 : OPEN 1 (next at 13)
89 -- 13 : EXACT (next at 19) operand=ab
90 -- 19 : CLOSE 1 (next at 23)
91 -- 23 : WHILEM (next at 0)
92 -- 26 : NOTHING (next at 29)
93 -- 29 : EOP (next at 0)
99 -- Name Operand? Meaning
101 (EOP, -- no End of program
102 MINMOD, -- no Next operator is not greedy
104 -- Classes of characters
106 ANY, -- no Match any one character except newline
107 SANY, -- no Match any character, including new line
108 ANYOF, -- class Match any character in this class
109 EXACT, -- str Match this string exactly
110 EXACTF, -- str Match this string (case-folding is one)
111 NOTHING, -- no Match empty string
112 SPACE, -- no Match any whitespace character
113 NSPACE, -- no Match any non-whitespace character
114 DIGIT, -- no Match any numeric character
115 NDIGIT, -- no Match any non-numeric character
116 ALNUM, -- no Match any alphanumeric character
117 NALNUM, -- no Match any non-alphanumeric character
121 BRANCH, -- node Match this alternative, or the next
123 -- Simple loops (when the following node is one character in length)
125 STAR, -- node Match this simple thing 0 or more times
126 PLUS, -- node Match this simple thing 1 or more times
127 CURLY, -- 2num node Match this simple thing between n and m times.
131 CURLYX, -- 2num node Match this complex thing {n,m} times
132 -- The nums are coded on two characters each.
134 WHILEM, -- no Do curly processing and see if rest matches
136 -- Matches after or before a word
138 BOL, -- no Match "" at beginning of line
139 MBOL, -- no Same, assuming mutiline (match after \n)
140 SBOL, -- no Same, assuming single line (don't match at \n)
141 EOL, -- no Match "" at end of line
142 MEOL, -- no Same, assuming mutiline (match before \n)
143 SEOL, -- no Same, assuming single line (don't match at \n)
145 BOUND, -- no Match "" at any word boundary
146 NBOUND, -- no Match "" at any word non-boundary
148 -- Parenthesis groups handling
150 REFF, -- num Match some already matched string, folded
151 OPEN, -- num Mark this point in input as start of #n
152 CLOSE); -- num Analogous to OPEN
154 for Opcode'Size use 8;
159 -- The set of branches constituting a single choice are hooked
160 -- together with their "next" pointers, since precedence prevents
161 -- anything being concatenated to any individual branch. The
162 -- "next" pointer of the last BRANCH in a choice points to the
163 -- thing following the whole choice. This is also where the
164 -- final "next" pointer of each individual branch points; each
165 -- branch starts with the operand node of a BRANCH node.
168 -- '?', and complex '*' and '+', are implemented with CURLYX.
169 -- branches. Simple cases (one character per match) are implemented with
170 -- STAR and PLUS for speed and to minimize recursive plunges.
173 -- ...are numbered at compile time.
176 -- There are in fact two arguments, the first one is the length (minus
177 -- one of the string argument), coded on one character, the second
178 -- argument is the string itself, coded on length + 1 characters.
180 -- A node is one char of opcode followed by two chars of "next" pointer.
181 -- "Next" pointers are stored as two 8-bit pieces, high order first. The
182 -- value is a positive offset from the opcode of the node containing it.
183 -- An operand, if any, simply follows the node. (Note that much of the
184 -- code generation knows about this implicit relationship.)
186 -- Using two bytes for the "next" pointer is vast overkill for most
187 -- things, but allows patterns to get big without disasters.
189 -----------------------
190 -- Character classes --
191 -----------------------
192 -- This is the implementation for character classes ([...]) in the
193 -- syntax for regular expressions. Each character (0..256) has an
194 -- entry into the table. This makes for a very fast matching
197 type Class_Byte is mod 256;
198 type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte;
200 type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte;
201 Bit_Conversion : constant Bit_Conversion_Array :=
202 (1, 2, 4, 8, 16, 32, 64, 128);
204 type Std_Class is (ANYOF_NONE,
205 ANYOF_ALNUM, -- Alphanumeric class [a-zA-Z0-9]
207 ANYOF_SPACE, -- Space class [ \t\n\r\f]
209 ANYOF_DIGIT, -- Digit class [0-9]
211 ANYOF_ALNUMC, -- Alphanumeric class [a-zA-Z0-9]
213 ANYOF_ALPHA, -- Alpha class [a-zA-Z]
215 ANYOF_ASCII, -- Ascii class (7 bits) 0..127
217 ANYOF_CNTRL, -- Control class
219 ANYOF_GRAPH, -- Graphic class
221 ANYOF_LOWER, -- Lower case class [a-z]
223 ANYOF_PRINT, -- printable class
227 ANYOF_UPPER, -- Upper case class [A-Z]
229 ANYOF_XDIGIT, -- Hexadecimal digit
233 procedure Set_In_Class
234 (Bitmap : in out Character_Class;
236 -- Set the entry to True for C in the class Bitmap.
238 function Get_From_Class
239 (Bitmap : Character_Class;
242 -- Return True if the entry is set for C in the class Bitmap.
244 procedure Reset_Class (Bitmap : out Character_Class);
245 -- Clear all the entries in the class Bitmap.
247 pragma Inline (Set_In_Class);
248 pragma Inline (Get_From_Class);
249 pragma Inline (Reset_Class);
251 -----------------------
252 -- Local Subprograms --
253 -----------------------
255 function "=" (Left : Character; Right : Opcode) return Boolean;
257 function Is_Alnum (C : Character) return Boolean;
258 -- Return True if C is an alphanum character or an underscore ('_')
260 function Is_White_Space (C : Character) return Boolean;
261 -- Return True if C is a whitespace character
263 function Is_Printable (C : Character) return Boolean;
264 -- Return True if C is a printable character
266 function Operand (P : Pointer) return Pointer;
267 -- Return a pointer to the first operand of the node at P
269 function String_Length
270 (Program : Program_Data;
273 -- Return the length of the string argument of the node at P
275 function String_Operand (P : Pointer) return Pointer;
276 -- Return a pointer to the string argument of the node at P
278 procedure Bitmap_Operand
279 (Program : Program_Data;
281 Op : out Character_Class);
282 -- Return a pointer to the string argument of the node at P
284 function Get_Next_Offset
285 (Program : Program_Data;
288 -- Get the offset field of a node. Used by Get_Next.
291 (Program : Program_Data;
294 -- Dig the next instruction pointer out of a node
296 procedure Optimize (Self : in out Pattern_Matcher);
297 -- Optimize a Pattern_Matcher by noting certain special cases
299 function Read_Natural
300 (Program : Program_Data;
303 -- Return the 2-byte natural coded at position IP.
305 -- All of the subprograms above are tiny and should be inlined
308 pragma Inline (Is_Alnum);
309 pragma Inline (Is_White_Space);
310 pragma Inline (Get_Next);
311 pragma Inline (Get_Next_Offset);
312 pragma Inline (Operand);
313 pragma Inline (Read_Natural);
314 pragma Inline (String_Length);
315 pragma Inline (String_Operand);
317 type Expression_Flags is record
318 Has_Width, -- Known never to match null string
319 Simple, -- Simple enough to be STAR/PLUS operand
320 SP_Start : Boolean; -- Starts with * or +
323 Worst_Expression : constant Expression_Flags := (others => False);
330 function "=" (Left : Character; Right : Opcode) return Boolean is
332 return Character'Pos (Left) = Opcode'Pos (Right);
339 procedure Bitmap_Operand
340 (Program : Program_Data;
342 Op : out Character_Class)
344 function Convert is new Unchecked_Conversion
345 (Program_Data, Character_Class);
348 Op (0 .. 31) := Convert (Program (P + 3 .. P + 34));
356 (Matcher : out Pattern_Matcher;
358 Final_Code_Size : out Program_Size;
359 Flags : Regexp_Flags := No_Flags)
361 -- We can't allocate space until we know how big the compiled form
362 -- will be, but we can't compile it (and thus know how big it is)
363 -- until we've got a place to put the code. So we cheat: we compile
364 -- it twice, once with code generation turned off and size counting
365 -- turned on, and once "for real".
367 -- This also means that we don't allocate space until we are sure
368 -- that the thing really will compile successfully, and we never
369 -- have to move the code and thus invalidate pointers into it.
371 -- Beware that the optimization-preparation code in here knows
372 -- about some of the structure of the compiled regexp.
374 PM : Pattern_Matcher renames Matcher;
375 Program : Program_Data renames PM.Program;
377 Emit_Code : constant Boolean := PM.Size > 0;
378 Emit_Ptr : Pointer := Program_First;
380 Parse_Pos : Natural := Expression'First; -- Input-scan pointer
381 Parse_End : constant Natural := Expression'Last;
383 ----------------------------
384 -- Subprograms for Create --
385 ----------------------------
387 procedure Emit (B : Character);
388 -- Output the Character B to the Program. If code-generation is
389 -- disabled, simply increments the program counter.
391 function Emit_Node (Op : Opcode) return Pointer;
392 -- If code-generation is enabled, Emit_Node outputs the
393 -- opcode Op and reserves space for a pointer to the next node.
394 -- Return value is the location of new opcode, ie old Emit_Ptr.
396 procedure Emit_Natural (IP : Pointer; N : Natural);
397 -- Split N on two characters at position IP.
399 procedure Emit_Class (Bitmap : Character_Class);
400 -- Emits a character class.
402 procedure Case_Emit (C : Character);
403 -- Emit C, after converting is to lower-case if the regular
404 -- expression is case insensitive.
407 (Parenthesized : Boolean;
408 Flags : out Expression_Flags;
410 -- Parse regular expression, i.e. main body or parenthesized thing
411 -- Caller must absorb opening parenthesis.
413 procedure Parse_Branch
414 (Flags : out Expression_Flags;
417 -- Implements the concatenation operator and handles '|'
418 -- First should be true if this is the first item of the alternative.
420 procedure Parse_Piece
421 (Expr_Flags : out Expression_Flags;
423 -- Parse something followed by possible [*+?]
426 (Expr_Flags : out Expression_Flags;
428 -- Parse_Atom is the lowest level parse procedure.
429 -- Optimization: gobbles an entire sequence of ordinary characters
430 -- so that it can turn them into a single node, which is smaller to
431 -- store and faster to run. Backslashed characters are exceptions,
432 -- each becoming a separate node; the code is simpler that way and
433 -- it's not worth fixing.
435 procedure Insert_Operator
438 Greedy : Boolean := True);
439 -- Insert_Operator inserts an operator in front of an
440 -- already-emitted operand and relocates the operand.
441 -- This applies to PLUS and STAR.
442 -- If Minmod is True, then the operator is non-greedy.
444 procedure Insert_Curly_Operator
449 Greedy : Boolean := True);
450 -- Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}).
451 -- If Minmod is True, then the operator is non-greedy.
453 procedure Link_Tail (P, Val : Pointer);
454 -- Link_Tail sets the next-pointer at the end of a node chain
456 procedure Link_Operand_Tail (P, Val : Pointer);
457 -- Link_Tail on operand of first argument; nop if operandless
459 function Next_Instruction (P : Pointer) return Pointer;
460 -- Dig the "next" pointer out of a node
462 procedure Fail (M : in String);
463 pragma No_Return (Fail);
464 -- Fail with a diagnostic message, if possible
466 function Is_Curly_Operator (IP : Natural) return Boolean;
467 -- Return True if IP is looking at a '{' that is the beginning
468 -- of a curly operator, ie it matches {\d+,?\d*}
470 function Is_Mult (IP : Natural) return Boolean;
471 -- Return True if C is a regexp multiplier: '+', '*' or '?'
473 procedure Get_Curly_Arguments
477 Greedy : out Boolean);
478 -- Parse the argument list for a curly operator.
479 -- It is assumed that IP is indeed pointing at a valid operator.
480 -- So what is IP and how come IP is not referenced in the body ???
482 procedure Parse_Character_Class (IP : out Pointer);
483 -- Parse a character class.
484 -- The calling subprogram should consume the opening '[' before.
486 procedure Parse_Literal
487 (Expr_Flags : out Expression_Flags;
489 -- Parse_Literal encodes a string of characters to be matched exactly
491 function Parse_Posix_Character_Class return Std_Class;
492 -- Parse a posic character class, like [:alpha:] or [:^alpha:].
493 -- The called is suppoed to absorbe the opening [.
495 pragma Inline (Is_Mult);
496 pragma Inline (Emit_Natural);
497 pragma Inline (Parse_Character_Class); -- since used only once
503 procedure Case_Emit (C : Character) is
505 if (Flags and Case_Insensitive) /= 0 then
509 -- Dump current character
519 procedure Emit (B : Character) is
522 Program (Emit_Ptr) := B;
525 Emit_Ptr := Emit_Ptr + 1;
532 procedure Emit_Class (Bitmap : Character_Class) is
533 subtype Program31 is Program_Data (0 .. 31);
535 function Convert is new Unchecked_Conversion
536 (Character_Class, Program31);
540 Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap);
543 Emit_Ptr := Emit_Ptr + 32;
550 procedure Emit_Natural (IP : Pointer; N : Natural) is
553 Program (IP + 1) := Character'Val (N / 256);
554 Program (IP) := Character'Val (N mod 256);
562 function Emit_Node (Op : Opcode) return Pointer is
563 Result : constant Pointer := Emit_Ptr;
567 Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op));
568 Program (Emit_Ptr + 1) := ASCII.NUL;
569 Program (Emit_Ptr + 2) := ASCII.NUL;
572 Emit_Ptr := Emit_Ptr + 3;
580 procedure Fail (M : in String) is
582 raise Expression_Error;
585 -------------------------
586 -- Get_Curly_Arguments --
587 -------------------------
589 procedure Get_Curly_Arguments
593 Greedy : out Boolean)
595 pragma Unreferenced (IP);
597 Save_Pos : Natural := Parse_Pos + 1;
601 Max := Max_Curly_Repeat;
603 while Expression (Parse_Pos) /= '}'
604 and then Expression (Parse_Pos) /= ','
606 Parse_Pos := Parse_Pos + 1;
609 Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1));
611 if Expression (Parse_Pos) = ',' then
612 Save_Pos := Parse_Pos + 1;
613 while Expression (Parse_Pos) /= '}' loop
614 Parse_Pos := Parse_Pos + 1;
617 if Save_Pos /= Parse_Pos then
618 Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1));
625 if Parse_Pos < Expression'Last
626 and then Expression (Parse_Pos + 1) = '?'
629 Parse_Pos := Parse_Pos + 1;
634 end Get_Curly_Arguments;
636 ---------------------------
637 -- Insert_Curly_Operator --
638 ---------------------------
640 procedure Insert_Curly_Operator
645 Greedy : Boolean := True)
647 Dest : constant Pointer := Emit_Ptr;
652 -- If the operand is not greedy, insert an extra operand before it
658 -- Move the operand in the byte-compilation, so that we can insert
659 -- the operator before it.
662 Program (Operand + Size .. Emit_Ptr + Size) :=
663 Program (Operand .. Emit_Ptr);
666 -- Insert the operator at the position previously occupied by the
672 Old := Emit_Node (MINMOD);
673 Link_Tail (Old, Old + 3);
676 Old := Emit_Node (Op);
677 Emit_Natural (Old + 3, Min);
678 Emit_Natural (Old + 5, Max);
680 Emit_Ptr := Dest + Size;
681 end Insert_Curly_Operator;
683 ---------------------
684 -- Insert_Operator --
685 ---------------------
687 procedure Insert_Operator
690 Greedy : Boolean := True)
692 Dest : constant Pointer := Emit_Ptr;
697 -- If not greedy, we have to emit another opcode first
703 -- Move the operand in the byte-compilation, so that we can insert
704 -- the operator before it.
707 Program (Operand + Size .. Emit_Ptr + Size) :=
708 Program (Operand .. Emit_Ptr);
711 -- Insert the operator at the position previously occupied by the
717 Old := Emit_Node (MINMOD);
718 Link_Tail (Old, Old + 3);
721 Old := Emit_Node (Op);
722 Emit_Ptr := Dest + Size;
725 -----------------------
726 -- Is_Curly_Operator --
727 -----------------------
729 function Is_Curly_Operator (IP : Natural) return Boolean is
730 Scan : Natural := IP;
733 if Expression (Scan) /= '{'
734 or else Scan + 2 > Expression'Last
735 or else not Is_Digit (Expression (Scan + 1))
747 if Scan > Expression'Last then
751 exit when not Is_Digit (Expression (Scan));
754 if Expression (Scan) = ',' then
758 if Scan > Expression'Last then
762 exit when not Is_Digit (Expression (Scan));
766 return Expression (Scan) = '}';
767 end Is_Curly_Operator;
773 function Is_Mult (IP : Natural) return Boolean is
774 C : constant Character := Expression (IP);
780 or else (C = '{' and then Is_Curly_Operator (IP));
783 -----------------------
784 -- Link_Operand_Tail --
785 -----------------------
787 procedure Link_Operand_Tail (P, Val : Pointer) is
789 if Emit_Code and then Program (P) = BRANCH then
790 Link_Tail (Operand (P), Val);
792 end Link_Operand_Tail;
798 procedure Link_Tail (P, Val : Pointer) is
804 if not Emit_Code then
812 Temp := Next_Instruction (Scan);
817 Offset := Val - Scan;
819 Emit_Natural (Scan + 1, Natural (Offset));
822 ----------------------
823 -- Next_Instruction --
824 ----------------------
826 function Next_Instruction (P : Pointer) return Pointer is
830 if not Emit_Code then
834 Offset := Get_Next_Offset (Program, P);
841 end Next_Instruction;
847 -- Combining parenthesis handling with the base level
848 -- of regular expression is a trifle forced, but the
849 -- need to tie the tails of the branches to what follows
850 -- makes it hard to avoid.
853 (Parenthesized : in Boolean;
854 Flags : out Expression_Flags;
857 E : String renames Expression;
861 New_Flags : Expression_Flags;
862 Have_Branch : Boolean := False;
865 Flags := (Has_Width => True, others => False); -- Tentatively
867 -- Make an OPEN node, if parenthesized
869 if Parenthesized then
870 if Matcher.Paren_Count > Max_Paren_Count then
871 Fail ("too many ()");
874 Par_No := Matcher.Paren_Count + 1;
875 Matcher.Paren_Count := Matcher.Paren_Count + 1;
876 IP := Emit_Node (OPEN);
877 Emit (Character'Val (Par_No));
884 -- Pick up the branches, linking them together
886 Parse_Branch (New_Flags, True, Br);
893 if Parse_Pos <= Parse_End
894 and then E (Parse_Pos) = '|'
896 Insert_Operator (BRANCH, Br);
901 Link_Tail (IP, Br); -- OPEN -> first
906 if not New_Flags.Has_Width then
907 Flags.Has_Width := False;
910 Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
912 while Parse_Pos <= Parse_End
913 and then (E (Parse_Pos) = '|')
915 Parse_Pos := Parse_Pos + 1;
916 Parse_Branch (New_Flags, False, Br);
923 Link_Tail (IP, Br); -- BRANCH -> BRANCH
925 if not New_Flags.Has_Width then
926 Flags.Has_Width := False;
929 Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
932 -- Make a closing node, and hook it on the end
934 if Parenthesized then
935 Ender := Emit_Node (CLOSE);
936 Emit (Character'Val (Par_No));
938 Ender := Emit_Node (EOP);
941 Link_Tail (IP, Ender);
945 -- Hook the tails of the branches to the closing node
950 Link_Operand_Tail (Br, Ender);
951 Br := Next_Instruction (Br);
955 -- Check for proper termination
957 if Parenthesized then
958 if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then
959 Fail ("unmatched ()");
962 Parse_Pos := Parse_Pos + 1;
964 elsif Parse_Pos <= Parse_End then
965 if E (Parse_Pos) = ')' then
966 Fail ("unmatched ()");
968 Fail ("junk on end"); -- "Can't happen"
978 (Expr_Flags : out Expression_Flags;
984 -- Tentatively set worst expression case
986 Expr_Flags := Worst_Expression;
988 C := Expression (Parse_Pos);
989 Parse_Pos := Parse_Pos + 1;
993 if (Flags and Multiple_Lines) /= 0 then
994 IP := Emit_Node (MBOL);
995 elsif (Flags and Single_Line) /= 0 then
996 IP := Emit_Node (SBOL);
998 IP := Emit_Node (BOL);
1002 if (Flags and Multiple_Lines) /= 0 then
1003 IP := Emit_Node (MEOL);
1004 elsif (Flags and Single_Line) /= 0 then
1005 IP := Emit_Node (SEOL);
1007 IP := Emit_Node (EOL);
1011 if (Flags and Single_Line) /= 0 then
1012 IP := Emit_Node (SANY);
1014 IP := Emit_Node (ANY);
1017 Expr_Flags.Has_Width := True;
1018 Expr_Flags.Simple := True;
1021 Parse_Character_Class (IP);
1022 Expr_Flags.Has_Width := True;
1023 Expr_Flags.Simple := True;
1027 New_Flags : Expression_Flags;
1030 Parse (True, New_Flags, IP);
1036 Expr_Flags.Has_Width :=
1037 Expr_Flags.Has_Width or New_Flags.Has_Width;
1038 Expr_Flags.SP_Start :=
1039 Expr_Flags.SP_Start or New_Flags.SP_Start;
1042 when '|' | ASCII.LF | ')' =>
1043 Fail ("internal urp"); -- Supposed to be caught earlier
1045 when '?' | '+' | '*' =>
1046 Fail (C & " follows nothing");
1049 if Is_Curly_Operator (Parse_Pos - 1) then
1050 Fail (C & " follows nothing");
1052 Parse_Literal (Expr_Flags, IP);
1056 if Parse_Pos > Parse_End then
1057 Fail ("trailing \");
1060 Parse_Pos := Parse_Pos + 1;
1062 case Expression (Parse_Pos - 1) is
1064 IP := Emit_Node (BOUND);
1067 IP := Emit_Node (NBOUND);
1070 IP := Emit_Node (SPACE);
1071 Expr_Flags.Simple := True;
1072 Expr_Flags.Has_Width := True;
1075 IP := Emit_Node (NSPACE);
1076 Expr_Flags.Simple := True;
1077 Expr_Flags.Has_Width := True;
1080 IP := Emit_Node (DIGIT);
1081 Expr_Flags.Simple := True;
1082 Expr_Flags.Has_Width := True;
1085 IP := Emit_Node (NDIGIT);
1086 Expr_Flags.Simple := True;
1087 Expr_Flags.Has_Width := True;
1090 IP := Emit_Node (ALNUM);
1091 Expr_Flags.Simple := True;
1092 Expr_Flags.Has_Width := True;
1095 IP := Emit_Node (NALNUM);
1096 Expr_Flags.Simple := True;
1097 Expr_Flags.Has_Width := True;
1100 IP := Emit_Node (SBOL);
1103 IP := Emit_Node (SEOL);
1106 IP := Emit_Node (REFF);
1109 Save : constant Natural := Parse_Pos - 1;
1112 while Parse_Pos <= Expression'Last
1113 and then Is_Digit (Expression (Parse_Pos))
1115 Parse_Pos := Parse_Pos + 1;
1118 Emit (Character'Val (Natural'Value
1119 (Expression (Save .. Parse_Pos - 1))));
1123 Parse_Pos := Parse_Pos - 1;
1124 Parse_Literal (Expr_Flags, IP);
1128 Parse_Literal (Expr_Flags, IP);
1136 procedure Parse_Branch
1137 (Flags : out Expression_Flags;
1141 E : String renames Expression;
1144 New_Flags : Expression_Flags;
1147 pragma Warnings (Off, Discard);
1150 Flags := Worst_Expression; -- Tentatively
1155 IP := Emit_Node (BRANCH);
1160 while Parse_Pos <= Parse_End
1161 and then E (Parse_Pos) /= ')'
1162 and then E (Parse_Pos) /= ASCII.LF
1163 and then E (Parse_Pos) /= '|'
1165 Parse_Piece (New_Flags, Last);
1172 Flags.Has_Width := Flags.Has_Width or New_Flags.Has_Width;
1174 if Chain = 0 then -- First piece
1175 Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
1177 Link_Tail (Chain, Last);
1183 -- Case where loop ran zero CURLY
1186 Discard := Emit_Node (NOTHING);
1190 ---------------------------
1191 -- Parse_Character_Class --
1192 ---------------------------
1194 procedure Parse_Character_Class (IP : out Pointer) is
1195 Bitmap : Character_Class;
1196 Invert : Boolean := False;
1197 In_Range : Boolean := False;
1198 Named_Class : Std_Class := ANYOF_NONE;
1200 Last_Value : Character := ASCII.Nul;
1203 Reset_Class (Bitmap);
1205 -- Do we have an invert character class ?
1207 if Parse_Pos <= Parse_End
1208 and then Expression (Parse_Pos) = '^'
1211 Parse_Pos := Parse_Pos + 1;
1214 -- First character can be ] or -, without closing the class.
1216 if Parse_Pos <= Parse_End
1217 and then (Expression (Parse_Pos) = ']'
1218 or else Expression (Parse_Pos) = '-')
1220 Set_In_Class (Bitmap, Expression (Parse_Pos));
1221 Parse_Pos := Parse_Pos + 1;
1224 -- While we don't have the end of the class
1226 while Parse_Pos <= Parse_End
1227 and then Expression (Parse_Pos) /= ']'
1229 Named_Class := ANYOF_NONE;
1230 Value := Expression (Parse_Pos);
1231 Parse_Pos := Parse_Pos + 1;
1233 -- Do we have a Posix character class
1235 Named_Class := Parse_Posix_Character_Class;
1237 elsif Value = '\' then
1238 if Parse_Pos = Parse_End then
1239 Fail ("Trailing \");
1241 Value := Expression (Parse_Pos);
1242 Parse_Pos := Parse_Pos + 1;
1245 when 'w' => Named_Class := ANYOF_ALNUM;
1246 when 'W' => Named_Class := ANYOF_NALNUM;
1247 when 's' => Named_Class := ANYOF_SPACE;
1248 when 'S' => Named_Class := ANYOF_NSPACE;
1249 when 'd' => Named_Class := ANYOF_DIGIT;
1250 when 'D' => Named_Class := ANYOF_NDIGIT;
1251 when 'n' => Value := ASCII.LF;
1252 when 'r' => Value := ASCII.CR;
1253 when 't' => Value := ASCII.HT;
1254 when 'f' => Value := ASCII.FF;
1255 when 'e' => Value := ASCII.ESC;
1256 when 'a' => Value := ASCII.BEL;
1258 -- when 'x' => ??? hexadecimal value
1259 -- when 'c' => ??? control character
1260 -- when '0'..'9' => ??? octal character
1262 when others => null;
1266 -- Do we have a character class?
1268 if Named_Class /= ANYOF_NONE then
1270 -- A range like 'a-\d' or 'a-[:digit:] is not a range
1273 Set_In_Class (Bitmap, Last_Value);
1274 Set_In_Class (Bitmap, '-');
1281 when ANYOF_NONE => null;
1283 when ANYOF_ALNUM | ANYOF_ALNUMC =>
1284 for Value in Class_Byte'Range loop
1285 if Is_Alnum (Character'Val (Value)) then
1286 Set_In_Class (Bitmap, Character'Val (Value));
1290 when ANYOF_NALNUM | ANYOF_NALNUMC =>
1291 for Value in Class_Byte'Range loop
1292 if not Is_Alnum (Character'Val (Value)) then
1293 Set_In_Class (Bitmap, Character'Val (Value));
1298 for Value in Class_Byte'Range loop
1299 if Is_White_Space (Character'Val (Value)) then
1300 Set_In_Class (Bitmap, Character'Val (Value));
1304 when ANYOF_NSPACE =>
1305 for Value in Class_Byte'Range loop
1306 if not Is_White_Space (Character'Val (Value)) then
1307 Set_In_Class (Bitmap, Character'Val (Value));
1312 for Value in Class_Byte'Range loop
1313 if Is_Digit (Character'Val (Value)) then
1314 Set_In_Class (Bitmap, Character'Val (Value));
1318 when ANYOF_NDIGIT =>
1319 for Value in Class_Byte'Range loop
1320 if not Is_Digit (Character'Val (Value)) then
1321 Set_In_Class (Bitmap, Character'Val (Value));
1326 for Value in Class_Byte'Range loop
1327 if Is_Letter (Character'Val (Value)) then
1328 Set_In_Class (Bitmap, Character'Val (Value));
1332 when ANYOF_NALPHA =>
1333 for Value in Class_Byte'Range loop
1334 if not Is_Letter (Character'Val (Value)) then
1335 Set_In_Class (Bitmap, Character'Val (Value));
1340 for Value in 0 .. 127 loop
1341 Set_In_Class (Bitmap, Character'Val (Value));
1344 when ANYOF_NASCII =>
1345 for Value in 128 .. 255 loop
1346 Set_In_Class (Bitmap, Character'Val (Value));
1350 for Value in Class_Byte'Range loop
1351 if Is_Control (Character'Val (Value)) then
1352 Set_In_Class (Bitmap, Character'Val (Value));
1356 when ANYOF_NCNTRL =>
1357 for Value in Class_Byte'Range loop
1358 if not Is_Control (Character'Val (Value)) then
1359 Set_In_Class (Bitmap, Character'Val (Value));
1364 for Value in Class_Byte'Range loop
1365 if Is_Graphic (Character'Val (Value)) then
1366 Set_In_Class (Bitmap, Character'Val (Value));
1370 when ANYOF_NGRAPH =>
1371 for Value in Class_Byte'Range loop
1372 if not Is_Graphic (Character'Val (Value)) then
1373 Set_In_Class (Bitmap, Character'Val (Value));
1378 for Value in Class_Byte'Range loop
1379 if Is_Lower (Character'Val (Value)) then
1380 Set_In_Class (Bitmap, Character'Val (Value));
1384 when ANYOF_NLOWER =>
1385 for Value in Class_Byte'Range loop
1386 if not Is_Lower (Character'Val (Value)) then
1387 Set_In_Class (Bitmap, Character'Val (Value));
1392 for Value in Class_Byte'Range loop
1393 if Is_Printable (Character'Val (Value)) then
1394 Set_In_Class (Bitmap, Character'Val (Value));
1398 when ANYOF_NPRINT =>
1399 for Value in Class_Byte'Range loop
1400 if not Is_Printable (Character'Val (Value)) then
1401 Set_In_Class (Bitmap, Character'Val (Value));
1406 for Value in Class_Byte'Range loop
1407 if Is_Printable (Character'Val (Value))
1408 and then not Is_White_Space (Character'Val (Value))
1409 and then not Is_Alnum (Character'Val (Value))
1411 Set_In_Class (Bitmap, Character'Val (Value));
1415 when ANYOF_NPUNCT =>
1416 for Value in Class_Byte'Range loop
1417 if not Is_Printable (Character'Val (Value))
1418 or else Is_White_Space (Character'Val (Value))
1419 or else Is_Alnum (Character'Val (Value))
1421 Set_In_Class (Bitmap, Character'Val (Value));
1426 for Value in Class_Byte'Range loop
1427 if Is_Upper (Character'Val (Value)) then
1428 Set_In_Class (Bitmap, Character'Val (Value));
1432 when ANYOF_NUPPER =>
1433 for Value in Class_Byte'Range loop
1434 if not Is_Upper (Character'Val (Value)) then
1435 Set_In_Class (Bitmap, Character'Val (Value));
1439 when ANYOF_XDIGIT =>
1440 for Value in Class_Byte'Range loop
1441 if Is_Hexadecimal_Digit (Character'Val (Value)) then
1442 Set_In_Class (Bitmap, Character'Val (Value));
1446 when ANYOF_NXDIGIT =>
1447 for Value in Class_Byte'Range loop
1448 if not Is_Hexadecimal_Digit
1449 (Character'Val (Value))
1451 Set_In_Class (Bitmap, Character'Val (Value));
1457 -- Not a character range
1459 elsif not In_Range then
1460 Last_Value := Value;
1462 if Expression (Parse_Pos) = '-'
1463 and then Parse_Pos < Parse_End
1464 and then Expression (Parse_Pos + 1) /= ']'
1466 Parse_Pos := Parse_Pos + 1;
1468 -- Do we have a range like '\d-a' and '[:space:]-a'
1469 -- which is not a real range
1471 if Named_Class /= ANYOF_NONE then
1472 Set_In_Class (Bitmap, '-');
1478 Set_In_Class (Bitmap, Value);
1482 -- Else in a character range
1485 if Last_Value > Value then
1486 Fail ("Invalid Range [" & Last_Value'Img
1487 & "-" & Value'Img & "]");
1490 while Last_Value <= Value loop
1491 Set_In_Class (Bitmap, Last_Value);
1492 Last_Value := Character'Succ (Last_Value);
1501 -- Optimize case-insensitive ranges (put the upper case or lower
1502 -- case character into the bitmap)
1504 if (Flags and Case_Insensitive) /= 0 then
1505 for C in Character'Range loop
1506 if Get_From_Class (Bitmap, C) then
1507 Set_In_Class (Bitmap, To_Lower (C));
1508 Set_In_Class (Bitmap, To_Upper (C));
1513 -- Optimize inverted classes
1516 for J in Bitmap'Range loop
1517 Bitmap (J) := not Bitmap (J);
1521 Parse_Pos := Parse_Pos + 1;
1525 IP := Emit_Node (ANYOF);
1526 Emit_Class (Bitmap);
1527 end Parse_Character_Class;
1533 -- This is a bit tricky due to quoted chars and due to
1534 -- the multiplier characters '*', '+', and '?' that
1535 -- take the SINGLE char previous as their operand.
1537 -- On entry, the character at Parse_Pos - 1 is going to go
1538 -- into the string, no matter what it is. It could be
1539 -- following a \ if Parse_Atom was entered from the '\' case.
1541 -- Basic idea is to pick up a good char in C and examine
1542 -- the next char. If Is_Mult (C) then twiddle, if it's a \
1543 -- then frozzle and if it's another magic char then push C and
1544 -- terminate the string. If none of the above, push C on the
1545 -- string and go around again.
1547 -- Start_Pos is used to remember where "the current character"
1548 -- starts in the string, if due to an Is_Mult we need to back
1549 -- up and put the current char in a separate 1-character string.
1550 -- When Start_Pos is 0, C is the only char in the string;
1551 -- this is used in Is_Mult handling, and in setting the SIMPLE
1554 procedure Parse_Literal
1555 (Expr_Flags : out Expression_Flags;
1558 Start_Pos : Natural := 0;
1560 Length_Ptr : Pointer;
1562 Has_Special_Operator : Boolean := False;
1565 Parse_Pos := Parse_Pos - 1; -- Look at current character
1567 if (Flags and Case_Insensitive) /= 0 then
1568 IP := Emit_Node (EXACTF);
1570 IP := Emit_Node (EXACT);
1573 Length_Ptr := Emit_Ptr;
1574 Emit_Ptr := String_Operand (IP);
1578 C := Expression (Parse_Pos); -- Get current character
1581 when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' =>
1583 if Start_Pos = 0 then
1584 Start_Pos := Parse_Pos;
1585 Emit (C); -- First character is always emitted
1587 exit Parse_Loop; -- Else we are done
1590 when '?' | '+' | '*' | '{' =>
1592 if Start_Pos = 0 then
1593 Start_Pos := Parse_Pos;
1594 Emit (C); -- First character is always emitted
1596 -- Are we looking at an operator, or is this
1597 -- simply a normal character ?
1599 elsif not Is_Mult (Parse_Pos) then
1600 Start_Pos := Parse_Pos;
1604 -- We've got something like "abc?d". Mark this as a
1605 -- special case. What we want to emit is a first
1606 -- constant string for "ab", then one for "c" that will
1607 -- ultimately be transformed with a CURLY operator, A
1608 -- special case has to be handled for "a?", since there
1609 -- is no initial string to emit.
1611 Has_Special_Operator := True;
1616 Start_Pos := Parse_Pos;
1618 if Parse_Pos = Parse_End then
1619 Fail ("Trailing \");
1622 case Expression (Parse_Pos + 1) is
1623 when 'b' | 'B' | 's' | 'S' | 'd' | 'D'
1624 | 'w' | 'W' | '0' .. '9' | 'G' | 'A'
1626 when 'n' => Emit (ASCII.LF);
1627 when 't' => Emit (ASCII.HT);
1628 when 'r' => Emit (ASCII.CR);
1629 when 'f' => Emit (ASCII.FF);
1630 when 'e' => Emit (ASCII.ESC);
1631 when 'a' => Emit (ASCII.BEL);
1632 when others => Emit (Expression (Parse_Pos + 1));
1635 Parse_Pos := Parse_Pos + 1;
1639 Start_Pos := Parse_Pos;
1643 exit Parse_Loop when Emit_Ptr - Length_Ptr = 254;
1645 Parse_Pos := Parse_Pos + 1;
1647 exit Parse_Loop when Parse_Pos > Parse_End;
1648 end loop Parse_Loop;
1650 -- Is the string followed by a '*+?{' operator ? If yes, and if there
1651 -- is an initial string to emit, do it now.
1653 if Has_Special_Operator
1654 and then Emit_Ptr >= Length_Ptr + 3
1656 Emit_Ptr := Emit_Ptr - 1;
1657 Parse_Pos := Start_Pos;
1661 Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2);
1664 Expr_Flags.Has_Width := True;
1666 -- Slight optimization when there is a single character
1668 if Emit_Ptr = Length_Ptr + 2 then
1669 Expr_Flags.Simple := True;
1677 -- Note that the branching code sequences used for '?' and the
1678 -- general cases of '*' and + are somewhat optimized: they use
1679 -- the same NOTHING node as both the endmarker for their branch
1680 -- list and the body of the last branch. It might seem that
1681 -- this node could be dispensed with entirely, but the endmarker
1682 -- role is not redundant.
1684 procedure Parse_Piece
1685 (Expr_Flags : out Expression_Flags;
1689 New_Flags : Expression_Flags;
1690 Greedy : Boolean := True;
1693 Parse_Atom (New_Flags, IP);
1699 if Parse_Pos > Parse_End
1700 or else not Is_Mult (Parse_Pos)
1702 Expr_Flags := New_Flags;
1706 Op := Expression (Parse_Pos);
1709 Expr_Flags := (SP_Start => True, others => False);
1711 Expr_Flags := (Has_Width => True, others => False);
1714 -- Detect non greedy operators in the easy cases
1717 and then Parse_Pos + 1 <= Parse_End
1718 and then Expression (Parse_Pos + 1) = '?'
1721 Parse_Pos := Parse_Pos + 1;
1724 -- Generate the byte code
1729 if New_Flags.Simple then
1730 Insert_Operator (STAR, IP, Greedy);
1732 Link_Tail (IP, Emit_Node (WHILEM));
1733 Insert_Curly_Operator
1734 (CURLYX, 0, Max_Curly_Repeat, IP, Greedy);
1735 Link_Tail (IP, Emit_Node (NOTHING));
1740 if New_Flags.Simple then
1741 Insert_Operator (PLUS, IP, Greedy);
1743 Link_Tail (IP, Emit_Node (WHILEM));
1744 Insert_Curly_Operator
1745 (CURLYX, 1, Max_Curly_Repeat, IP, Greedy);
1746 Link_Tail (IP, Emit_Node (NOTHING));
1750 if New_Flags.Simple then
1751 Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy);
1753 Link_Tail (IP, Emit_Node (WHILEM));
1754 Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy);
1755 Link_Tail (IP, Emit_Node (NOTHING));
1763 Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy);
1765 if New_Flags.Simple then
1766 Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy);
1768 Link_Tail (IP, Emit_Node (WHILEM));
1769 Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy);
1770 Link_Tail (IP, Emit_Node (NOTHING));
1778 Parse_Pos := Parse_Pos + 1;
1780 if Parse_Pos <= Parse_End
1781 and then Is_Mult (Parse_Pos)
1783 Fail ("nested *+{");
1787 ---------------------------------
1788 -- Parse_Posix_Character_Class --
1789 ---------------------------------
1791 function Parse_Posix_Character_Class return Std_Class is
1792 Invert : Boolean := False;
1793 Class : Std_Class := ANYOF_NONE;
1794 E : String renames Expression;
1796 -- Class names. Note that code assumes that the length of all
1797 -- classes starting with the same letter have the same length.
1799 Alnum : constant String := "alnum:]";
1800 Alpha : constant String := "alpha:]";
1801 Ascii_C : constant String := "ascii:]";
1802 Cntrl : constant String := "cntrl:]";
1803 Digit : constant String := "digit:]";
1804 Graph : constant String := "graph:]";
1805 Lower : constant String := "lower:]";
1806 Print : constant String := "print:]";
1807 Punct : constant String := "punct:]";
1808 Space : constant String := "space:]";
1809 Upper : constant String := "upper:]";
1810 Word : constant String := "word:]";
1811 Xdigit : constant String := "xdigit:]";
1814 -- Case of character class specified
1816 if Parse_Pos <= Parse_End
1817 and then Expression (Parse_Pos) = ':'
1819 Parse_Pos := Parse_Pos + 1;
1821 -- Do we have something like: [[:^alpha:]]
1823 if Parse_Pos <= Parse_End
1824 and then Expression (Parse_Pos) = '^'
1827 Parse_Pos := Parse_Pos + 1;
1830 -- Check for class names based on first letter
1832 case Expression (Parse_Pos) is
1836 -- All 'a' classes have the same length (Alnum'Length)
1838 if Parse_Pos + Alnum'Length - 1 <= Parse_End then
1840 if E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) =
1844 Class := ANYOF_NALNUMC;
1846 Class := ANYOF_ALNUMC;
1849 Parse_Pos := Parse_Pos + Alnum'Length;
1851 elsif E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) =
1855 Class := ANYOF_NALPHA;
1857 Class := ANYOF_ALPHA;
1860 Parse_Pos := Parse_Pos + Alpha'Length;
1862 elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) =
1866 Class := ANYOF_NASCII;
1868 Class := ANYOF_ASCII;
1871 Parse_Pos := Parse_Pos + Ascii_C'Length;
1876 if Parse_Pos + Cntrl'Length - 1 <= Parse_End
1877 and then E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) =
1881 Class := ANYOF_NCNTRL;
1883 Class := ANYOF_CNTRL;
1886 Parse_Pos := Parse_Pos + Cntrl'Length;
1890 if Parse_Pos + Digit'Length - 1 <= Parse_End
1891 and then E (Parse_Pos .. Parse_Pos + Digit'Length - 1) =
1895 Class := ANYOF_NDIGIT;
1897 Class := ANYOF_DIGIT;
1900 Parse_Pos := Parse_Pos + Digit'Length;
1904 if Parse_Pos + Graph'Length - 1 <= Parse_End
1905 and then E (Parse_Pos .. Parse_Pos + Graph'Length - 1) =
1909 Class := ANYOF_NGRAPH;
1911 Class := ANYOF_GRAPH;
1913 Parse_Pos := Parse_Pos + Graph'Length;
1917 if Parse_Pos + Lower'Length - 1 <= Parse_End
1918 and then E (Parse_Pos .. Parse_Pos + Lower'Length - 1) =
1922 Class := ANYOF_NLOWER;
1924 Class := ANYOF_LOWER;
1926 Parse_Pos := Parse_Pos + Lower'Length;
1931 -- All 'p' classes have the same length
1933 if Parse_Pos + Print'Length - 1 <= Parse_End then
1934 if E (Parse_Pos .. Parse_Pos + Print'Length - 1) =
1938 Class := ANYOF_NPRINT;
1940 Class := ANYOF_PRINT;
1943 Parse_Pos := Parse_Pos + Print'Length;
1945 elsif E (Parse_Pos .. Parse_Pos + Punct'Length - 1) =
1949 Class := ANYOF_NPUNCT;
1951 Class := ANYOF_PUNCT;
1954 Parse_Pos := Parse_Pos + Punct'Length;
1959 if Parse_Pos + Space'Length - 1 <= Parse_End
1960 and then E (Parse_Pos .. Parse_Pos + Space'Length - 1) =
1964 Class := ANYOF_NSPACE;
1966 Class := ANYOF_SPACE;
1969 Parse_Pos := Parse_Pos + Space'Length;
1974 if Parse_Pos + Upper'Length - 1 <= Parse_End
1975 and then E (Parse_Pos .. Parse_Pos + Upper'Length - 1) =
1979 Class := ANYOF_NUPPER;
1981 Class := ANYOF_UPPER;
1983 Parse_Pos := Parse_Pos + Upper'Length;
1988 if Parse_Pos + Word'Length - 1 <= Parse_End
1989 and then E (Parse_Pos .. Parse_Pos + Word'Length - 1) =
1993 Class := ANYOF_NALNUM;
1995 Class := ANYOF_ALNUM;
1997 Parse_Pos := Parse_Pos + Word'Length;
2002 if Parse_Pos + Xdigit'Length - 1 <= Parse_End
2003 and then E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1)
2007 Class := ANYOF_NXDIGIT;
2009 Class := ANYOF_XDIGIT;
2012 Parse_Pos := Parse_Pos + Xdigit'Length;
2016 Fail ("Invalid character class");
2019 -- Character class not specified
2026 end Parse_Posix_Character_Class;
2028 Expr_Flags : Expression_Flags;
2031 -- Start of processing for Compile
2035 Parse (False, Expr_Flags, Result);
2038 Fail ("Couldn't compile expression");
2041 Final_Code_Size := Emit_Ptr - 1;
2043 -- Do we want to actually compile the expression, or simply get the
2054 (Expression : String;
2055 Flags : Regexp_Flags := No_Flags)
2056 return Pattern_Matcher
2058 Size : Program_Size;
2059 Dummy : Pattern_Matcher (0);
2062 Compile (Dummy, Expression, Size, Flags);
2065 Result : Pattern_Matcher (Size);
2067 Compile (Result, Expression, Size, Flags);
2073 (Matcher : out Pattern_Matcher;
2074 Expression : String;
2075 Flags : Regexp_Flags := No_Flags)
2077 Size : Program_Size;
2080 Compile (Matcher, Expression, Size, Flags);
2087 procedure Dump (Self : Pattern_Matcher) is
2089 -- Index : Pointer := Program_First + 1;
2090 -- What is the above line for ???
2093 Program : Program_Data renames Self.Program;
2095 procedure Dump_Until
2098 Indent : Natural := 0);
2099 -- Dump the program until the node Till (not included) is met.
2100 -- Every line is indented with Index spaces at the beginning
2101 -- Dumps till the end if Till is 0.
2107 procedure Dump_Until
2110 Indent : Natural := 0)
2113 Index : Pointer := Start;
2114 Local_Indent : Natural := Indent;
2118 while Index < Till loop
2120 Op := Opcode'Val (Character'Pos ((Self.Program (Index))));
2123 Local_Indent := Local_Indent - 3;
2127 Point : constant String := Pointer'Image (Index);
2130 for J in 1 .. 6 - Point'Length loop
2136 & (1 .. Local_Indent => ' ')
2137 & Opcode'Image (Op));
2140 -- Print the parenthesis number
2142 if Op = OPEN or else Op = CLOSE or else Op = REFF then
2143 Put (Natural'Image (Character'Pos (Program (Index + 3))));
2146 Next := Index + Get_Next_Offset (Program, Index);
2148 if Next = Index then
2149 Put (" (next at 0)");
2151 Put (" (next at " & Pointer'Image (Next) & ")");
2156 -- Character class operand
2160 Bitmap : Character_Class;
2161 Last : Character := ASCII.Nul;
2162 Current : Natural := 0;
2164 Current_Char : Character;
2167 Bitmap_Operand (Program, Index, Bitmap);
2170 while Current <= 255 loop
2171 Current_Char := Character'Val (Current);
2173 -- First item in a range
2175 if Get_From_Class (Bitmap, Current_Char) then
2176 Last := Current_Char;
2178 -- Search for the last item in the range
2181 Current := Current + 1;
2182 exit when Current > 255;
2183 Current_Char := Character'Val (Current);
2185 not Get_From_Class (Bitmap, Current_Char);
2195 if Character'Succ (Last) /= Current_Char then
2196 Put ("-" & Character'Pred (Current_Char));
2200 Current := Current + 1;
2205 Index := Index + 3 + Bitmap'Length;
2210 when EXACT | EXACTF =>
2211 Length := String_Length (Program, Index);
2212 Put (" operand (length:" & Program_Size'Image (Length + 1)
2214 & String (Program (String_Operand (Index)
2215 .. String_Operand (Index)
2217 Index := String_Operand (Index) + Length + 1;
2224 Dump_Until (Index + 3, Next, Local_Indent + 3);
2230 -- Only one instruction
2232 Dump_Until (Index + 3, Index + 4, Local_Indent + 3);
2235 when CURLY | CURLYX =>
2237 & Natural'Image (Read_Natural (Program, Index + 3))
2239 & Natural'Image (Read_Natural (Program, Index + 5))
2242 Dump_Until (Index + 7, Next, Local_Indent + 3);
2248 Local_Indent := Local_Indent + 3;
2250 when CLOSE | REFF =>
2268 -- Start of processing for Dump
2271 pragma Assert (Self.Program (Program_First) = MAGIC,
2272 "Corrupted Pattern_Matcher");
2274 Put_Line ("Must start with (Self.First) = "
2275 & Character'Image (Self.First));
2277 if (Self.Flags and Case_Insensitive) /= 0 then
2278 Put_Line (" Case_Insensitive mode");
2281 if (Self.Flags and Single_Line) /= 0 then
2282 Put_Line (" Single_Line mode");
2285 if (Self.Flags and Multiple_Lines) /= 0 then
2286 Put_Line (" Multiple_Lines mode");
2289 Put_Line (" 1 : MAGIC");
2290 Dump_Until (Program_First + 1, Self.Program'Last + 1);
2293 --------------------
2294 -- Get_From_Class --
2295 --------------------
2297 function Get_From_Class
2298 (Bitmap : Character_Class;
2302 Value : constant Class_Byte := Character'Pos (C);
2306 (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0;
2313 function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is
2314 Offset : constant Pointer := Get_Next_Offset (Program, IP);
2324 ---------------------
2325 -- Get_Next_Offset --
2326 ---------------------
2328 function Get_Next_Offset
2329 (Program : Program_Data;
2334 return Pointer (Read_Natural (Program, IP + 1));
2335 end Get_Next_Offset;
2341 function Is_Alnum (C : Character) return Boolean is
2343 return Is_Alphanumeric (C) or else C = '_';
2350 function Is_Printable (C : Character) return Boolean is
2352 -- Printable if space or graphic character or other whitespace
2353 -- Other white space includes (HT/LF/VT/FF/CR = codes 9-13)
2355 return C in Character'Val (32) .. Character'Val (126)
2356 or else C in ASCII.HT .. ASCII.CR;
2359 --------------------
2360 -- Is_White_Space --
2361 --------------------
2363 function Is_White_Space (C : Character) return Boolean is
2365 -- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13
2367 return C = ' ' or else C in ASCII.HT .. ASCII.CR;
2375 (Self : Pattern_Matcher;
2377 Matches : out Match_Array;
2378 Data_First : Integer := -1;
2379 Data_Last : Positive := Positive'Last)
2381 Program : Program_Data renames Self.Program; -- Shorter notation
2383 First_In_Data : constant Integer := Integer'Max (Data_First, Data'First);
2384 Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last);
2386 -- Global work variables
2388 Input_Pos : Natural; -- String-input pointer
2389 BOL_Pos : Natural; -- Beginning of input, for ^ check
2390 Matched : Boolean := False; -- Until proven True
2392 Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count,
2394 -- Stores the value of all the parenthesis pairs.
2395 -- We do not use directly Matches, so that we can also use back
2396 -- references (REFF) even if Matches is too small.
2398 type Natural_Array is array (Match_Count range <>) of Natural;
2399 Matches_Tmp : Natural_Array (Matches_Full'Range);
2400 -- Save the opening position of parenthesis.
2402 Last_Paren : Natural := 0;
2403 -- Last parenthesis seen
2405 Greedy : Boolean := True;
2406 -- True if the next operator should be greedy
2408 type Current_Curly_Record;
2409 type Current_Curly_Access is access all Current_Curly_Record;
2410 type Current_Curly_Record is record
2411 Paren_Floor : Natural; -- How far back to strip parenthesis data
2412 Cur : Integer; -- How many instances of scan we've matched
2413 Min : Natural; -- Minimal number of scans to match
2414 Max : Natural; -- Maximal number of scans to match
2415 Greedy : Boolean; -- Whether to work our way up or down
2416 Scan : Pointer; -- The thing to match
2417 Next : Pointer; -- What has to match after it
2418 Lastloc : Natural; -- Where we started matching this scan
2419 Old_Cc : Current_Curly_Access; -- Before we started this one
2421 -- Data used to handle the curly operator and the plus and star
2422 -- operators for complex expressions.
2424 Current_Curly : Current_Curly_Access := null;
2425 -- The curly currently being processed.
2427 -----------------------
2428 -- Local Subprograms --
2429 -----------------------
2431 function Index (Start : Positive; C : Character) return Natural;
2432 -- Find character C in Data starting at Start and return position
2436 Max : Natural := Natural'Last)
2438 -- Repeatedly match something simple, report how many
2439 -- It only matches on things of length 1.
2440 -- Starting from Input_Pos, it matches at most Max CURLY.
2442 function Try (Pos : in Positive) return Boolean;
2443 -- Try to match at specific point
2445 function Match (IP : Pointer) return Boolean;
2446 -- This is the main matching routine. Conceptually the strategy
2447 -- is simple: check to see whether the current node matches,
2448 -- call self recursively to see whether the rest matches,
2449 -- and then act accordingly.
2451 -- In practice Match makes some effort to avoid recursion, in
2452 -- particular by going through "ordinary" nodes (that don't
2453 -- need to know whether the rest of the match failed) by
2454 -- using a loop instead of recursion.
2455 -- Why is the above comment part of the spec rather than body ???
2457 function Match_Whilem (IP : Pointer) return Boolean;
2458 -- Return True if a WHILEM matches
2459 -- How come IP is unreferenced in the body ???
2461 function Recurse_Match (IP : Pointer; From : Natural) return Boolean;
2462 pragma Inline (Recurse_Match);
2463 -- Calls Match recursively. It saves and restores the parenthesis
2464 -- status and location in the input stream correctly, so that
2465 -- backtracking is possible
2467 function Match_Simple_Operator
2473 -- Return True it the simple operator (possibly non-greedy) matches
2475 pragma Inline (Index);
2476 pragma Inline (Repeat);
2478 -- These are two complex functions, but used only once.
2480 pragma Inline (Match_Whilem);
2481 pragma Inline (Match_Simple_Operator);
2493 for J in Start .. Last_In_Data loop
2494 if Data (J) = C then
2506 function Recurse_Match (IP : Pointer; From : Natural) return Boolean is
2507 L : constant Natural := Last_Paren;
2509 Tmp_F : constant Match_Array :=
2510 Matches_Full (From + 1 .. Matches_Full'Last);
2512 Start : constant Natural_Array :=
2513 Matches_Tmp (From + 1 .. Matches_Tmp'Last);
2514 Input : constant Natural := Input_Pos;
2522 Matches_Full (Tmp_F'Range) := Tmp_F;
2523 Matches_Tmp (Start'Range) := Start;
2532 function Match (IP : Pointer) return Boolean is
2533 Scan : Pointer := IP;
2540 pragma Assert (Scan /= 0);
2542 -- Determine current opcode and count its usage in debug mode
2544 Op := Opcode'Val (Character'Pos (Program (Scan)));
2546 -- Calculate offset of next instruction.
2547 -- Second character is most significant in Program_Data.
2549 Next := Get_Next (Program, Scan);
2553 return True; -- Success !
2556 if Program (Next) /= BRANCH then
2557 Next := Operand (Scan); -- No choice, avoid recursion
2561 if Recurse_Match (Operand (Scan), 0) then
2565 Scan := Get_Next (Program, Scan);
2566 exit when Scan = 0 or Program (Scan) /= BRANCH;
2576 exit State_Machine when Input_Pos /= BOL_Pos
2577 and then ((Self.Flags and Multiple_Lines) = 0
2578 or else Data (Input_Pos - 1) /= ASCII.LF);
2581 exit State_Machine when Input_Pos /= BOL_Pos
2582 and then Data (Input_Pos - 1) /= ASCII.LF;
2585 exit State_Machine when Input_Pos /= BOL_Pos;
2588 exit State_Machine when Input_Pos <= Data'Last
2589 and then ((Self.Flags and Multiple_Lines) = 0
2590 or else Data (Input_Pos) /= ASCII.LF);
2593 exit State_Machine when Input_Pos <= Data'Last
2594 and then Data (Input_Pos) /= ASCII.LF;
2597 exit State_Machine when Input_Pos <= Data'Last;
2599 when BOUND | NBOUND =>
2601 -- Was last char in word ?
2604 N : Boolean := False;
2605 Ln : Boolean := False;
2608 if Input_Pos /= First_In_Data then
2609 N := Is_Alnum (Data (Input_Pos - 1));
2612 if Input_Pos > Last_In_Data then
2615 Ln := Is_Alnum (Data (Input_Pos));
2630 exit State_Machine when Input_Pos > Last_In_Data
2631 or else not Is_White_Space (Data (Input_Pos));
2632 Input_Pos := Input_Pos + 1;
2635 exit State_Machine when Input_Pos > Last_In_Data
2636 or else Is_White_Space (Data (Input_Pos));
2637 Input_Pos := Input_Pos + 1;
2640 exit State_Machine when Input_Pos > Last_In_Data
2641 or else not Is_Digit (Data (Input_Pos));
2642 Input_Pos := Input_Pos + 1;
2645 exit State_Machine when Input_Pos > Last_In_Data
2646 or else Is_Digit (Data (Input_Pos));
2647 Input_Pos := Input_Pos + 1;
2650 exit State_Machine when Input_Pos > Last_In_Data
2651 or else not Is_Alnum (Data (Input_Pos));
2652 Input_Pos := Input_Pos + 1;
2655 exit State_Machine when Input_Pos > Last_In_Data
2656 or else Is_Alnum (Data (Input_Pos));
2657 Input_Pos := Input_Pos + 1;
2660 exit State_Machine when Input_Pos > Last_In_Data
2661 or else Data (Input_Pos) = ASCII.LF;
2662 Input_Pos := Input_Pos + 1;
2665 exit State_Machine when Input_Pos > Last_In_Data;
2666 Input_Pos := Input_Pos + 1;
2670 Opnd : Pointer := String_Operand (Scan);
2671 Current : Positive := Input_Pos;
2673 Last : constant Pointer :=
2674 Opnd + String_Length (Program, Scan);
2677 while Opnd <= Last loop
2678 exit State_Machine when Current > Last_In_Data
2679 or else Program (Opnd) /= Data (Current);
2680 Current := Current + 1;
2684 Input_Pos := Current;
2689 Opnd : Pointer := String_Operand (Scan);
2690 Current : Positive := Input_Pos;
2692 Last : constant Pointer :=
2693 Opnd + String_Length (Program, Scan);
2696 while Opnd <= Last loop
2697 exit State_Machine when Current > Last_In_Data
2698 or else Program (Opnd) /= To_Lower (Data (Current));
2699 Current := Current + 1;
2703 Input_Pos := Current;
2708 Bitmap : Character_Class;
2711 Bitmap_Operand (Program, Scan, Bitmap);
2712 exit State_Machine when Input_Pos > Last_In_Data
2713 or else not Get_From_Class (Bitmap, Data (Input_Pos));
2714 Input_Pos := Input_Pos + 1;
2719 No : constant Natural :=
2720 Character'Pos (Program (Operand (Scan)));
2723 Matches_Tmp (No) := Input_Pos;
2728 No : constant Natural :=
2729 Character'Pos (Program (Operand (Scan)));
2732 Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1);
2734 if Last_Paren < No then
2741 No : constant Natural :=
2742 Character'Pos (Program (Operand (Scan)));
2747 -- If we haven't seen that parenthesis yet
2749 if Last_Paren < No then
2753 Data_Pos := Matches_Full (No).First;
2755 while Data_Pos <= Matches_Full (No).Last loop
2756 if Input_Pos > Last_In_Data
2757 or else Data (Input_Pos) /= Data (Data_Pos)
2762 Input_Pos := Input_Pos + 1;
2763 Data_Pos := Data_Pos + 1;
2770 when STAR | PLUS | CURLY =>
2772 Greed : constant Boolean := Greedy;
2776 return Match_Simple_Operator (Op, Scan, Next, Greed);
2781 -- Looking at something like:
2783 -- 1: CURLYX {n,m} (->4)
2784 -- 2: code for complex thing (->3)
2789 Min : constant Natural :=
2790 Read_Natural (Program, Scan + 3);
2791 Max : constant Natural :=
2792 Read_Natural (Program, Scan + 5);
2793 Cc : aliased Current_Curly_Record;
2795 Has_Match : Boolean;
2798 Cc := (Paren_Floor => Last_Paren,
2806 Old_Cc => Current_Curly);
2807 Current_Curly := Cc'Unchecked_Access;
2809 Has_Match := Match (Next - 3);
2811 -- Start on the WHILEM
2813 Current_Curly := Cc.Old_Cc;
2818 return Match_Whilem (IP);
2822 end loop State_Machine;
2824 -- If we get here, there is no match.
2825 -- For successful matches when EOP is the terminating point.
2830 ---------------------------
2831 -- Match_Simple_Operator --
2832 ---------------------------
2834 function Match_Simple_Operator
2841 Next_Char : Character := ASCII.Nul;
2842 Next_Char_Known : Boolean := False;
2843 No : Integer; -- Can be negative
2845 Max : Natural := Natural'Last;
2846 Operand_Code : Pointer;
2849 Save : constant Natural := Input_Pos;
2852 -- Lookahead to avoid useless match attempts
2853 -- when we know what character comes next.
2855 if Program (Next) = EXACT then
2856 Next_Char := Program (String_Operand (Next));
2857 Next_Char_Known := True;
2860 -- Find the minimal and maximal values for the operator
2865 Operand_Code := Operand (Scan);
2869 Operand_Code := Operand (Scan);
2872 Min := Read_Natural (Program, Scan + 3);
2873 Max := Read_Natural (Program, Scan + 5);
2874 Operand_Code := Scan + 7;
2877 -- Non greedy operators
2881 -- Test the minimal repetitions
2884 and then Repeat (Operand_Code, Min) < Min
2891 -- Find the place where 'next' could work
2893 if Next_Char_Known then
2894 -- Last position to check
2896 Last_Pos := Input_Pos + Max;
2898 if Last_Pos > Last_In_Data
2899 or else Max = Natural'Last
2901 Last_Pos := Last_In_Data;
2904 -- Look for the first possible opportunity
2907 -- Find the next possible position
2909 while Input_Pos <= Last_Pos
2910 and then Data (Input_Pos) /= Next_Char
2912 Input_Pos := Input_Pos + 1;
2915 if Input_Pos > Last_Pos then
2919 -- Check that we still match if we stop
2920 -- at the position we just found.
2923 Num : constant Natural := Input_Pos - Old;
2928 if Repeat (Operand_Code, Num) < Num then
2933 -- Input_Pos now points to the new position
2935 if Match (Get_Next (Program, Scan)) then
2940 Input_Pos := Input_Pos + 1;
2943 -- We know what the next character is
2946 while Max >= Min loop
2948 -- If the next character matches
2950 if Match (Next) then
2954 Input_Pos := Save + Min;
2956 -- Could not or did not match -- move forward
2958 if Repeat (Operand_Code, 1) /= 0 then
2971 No := Repeat (Operand_Code, Max);
2973 -- ??? Perl has some special code here in case the
2974 -- next instruction is of type EOL, since $ and \Z
2975 -- can match before *and* after newline at the end.
2977 -- ??? Perl has some special code here in case (paren)
2980 -- Else, if we don't have any parenthesis
2982 while No >= Min loop
2983 if not Next_Char_Known
2984 or else (Input_Pos <= Last_In_Data
2985 and then Data (Input_Pos) = Next_Char)
2987 if Match (Next) then
2992 -- Could not or did not work, we back up
2995 Input_Pos := Save + No;
3000 end Match_Simple_Operator;
3006 -- This is really hard to understand, because after we match what we
3007 -- are trying to match, we must make sure the rest of the REx is going
3008 -- to match for sure, and to do that we have to go back UP the parse
3009 -- tree by recursing ever deeper. And if it fails, we have to reset
3010 -- our parent's current state that we can try again after backing off.
3012 function Match_Whilem (IP : Pointer) return Boolean is
3013 pragma Unreferenced (IP);
3015 Cc : Current_Curly_Access := Current_Curly;
3016 N : constant Natural := Cc.Cur + 1;
3019 Lastloc : constant Natural := Cc.Lastloc;
3020 -- Detection of 0-len.
3023 -- If degenerate scan matches "", assume scan done.
3025 if Input_Pos = Cc.Lastloc
3026 and then N >= Cc.Min
3028 -- Temporarily restore the old context, and check that we
3029 -- match was comes after CURLYX.
3031 Current_Curly := Cc.Old_Cc;
3033 if Current_Curly /= null then
3034 Ln := Current_Curly.Cur;
3037 if Match (Cc.Next) then
3041 if Current_Curly /= null then
3042 Current_Curly.Cur := Ln;
3045 Current_Curly := Cc;
3049 -- First, just match a string of min scans.
3053 Cc.Lastloc := Input_Pos;
3055 if Match (Cc.Scan) then
3060 Cc.Lastloc := Lastloc;
3064 -- Prefer next over scan for minimal matching.
3066 if not Cc.Greedy then
3067 Current_Curly := Cc.Old_Cc;
3069 if Current_Curly /= null then
3070 Ln := Current_Curly.Cur;
3073 if Recurse_Match (Cc.Next, Cc.Paren_Floor) then
3077 if Current_Curly /= null then
3078 Current_Curly.Cur := Ln;
3081 Current_Curly := Cc;
3083 -- Maximum greed exceeded ?
3089 -- Try scanning more and see if it helps
3091 Cc.Lastloc := Input_Pos;
3093 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then
3098 Cc.Lastloc := Lastloc;
3102 -- Prefer scan over next for maximal matching
3104 if N < Cc.Max then -- more greed allowed ?
3106 Cc.Lastloc := Input_Pos;
3108 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then
3113 -- Failed deeper matches of scan, so see if this one works
3115 Current_Curly := Cc.Old_Cc;
3117 if Current_Curly /= null then
3118 Ln := Current_Curly.Cur;
3121 if Match (Cc.Next) then
3125 if Current_Curly /= null then
3126 Current_Curly.Cur := Ln;
3129 Current_Curly := Cc;
3131 Cc.Lastloc := Lastloc;
3141 Max : Natural := Natural'Last)
3144 Scan : Natural := Input_Pos;
3146 Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP)));
3149 Is_First : Boolean := True;
3150 Bitmap : Character_Class;
3153 if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then
3154 Last := Last_In_Data;
3156 Last := Scan + Max - 1;
3162 and then Data (Scan) /= ASCII.LF
3172 -- The string has only one character if Repeat was called
3174 C := Program (String_Operand (IP));
3176 and then C = Data (Scan)
3183 -- The string has only one character if Repeat was called
3185 C := Program (String_Operand (IP));
3187 and then To_Lower (C) = Data (Scan)
3194 Bitmap_Operand (Program, IP, Bitmap);
3199 and then Get_From_Class (Bitmap, Data (Scan))
3206 and then Is_Alnum (Data (Scan))
3213 and then not Is_Alnum (Data (Scan))
3220 and then Is_White_Space (Data (Scan))
3227 and then not Is_White_Space (Data (Scan))
3234 and then Is_Digit (Data (Scan))
3241 and then not Is_Digit (Data (Scan))
3247 raise Program_Error;
3250 Count := Scan - Input_Pos;
3259 function Try (Pos : in Positive) return Boolean is
3263 Matches_Full := (others => No_Match);
3265 if Match (Program_First + 1) then
3266 Matches_Full (0) := (Pos, Input_Pos - 1);
3273 -- Start of processing for Match
3276 -- Do we have the regexp Never_Match?
3278 if Self.Size = 0 then
3279 Matches (0) := No_Match;
3283 -- Check validity of program
3286 (Program (Program_First) = MAGIC,
3287 "Corrupted Pattern_Matcher");
3289 -- If there is a "must appear" string, look for it
3291 if Self.Must_Have_Length > 0 then
3293 First : constant Character := Program (Self.Must_Have);
3294 Must_First : constant Pointer := Self.Must_Have;
3295 Must_Last : constant Pointer :=
3296 Must_First + Pointer (Self.Must_Have_Length - 1);
3297 Next_Try : Natural := Index (First_In_Data, First);
3301 and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1)
3302 = String (Program (Must_First .. Must_Last))
3304 Next_Try := Index (Next_Try + 1, First);
3307 if Next_Try = 0 then
3308 Matches_Full := (others => No_Match);
3309 return; -- Not present
3314 -- Mark beginning of line for ^
3316 BOL_Pos := Data'First;
3318 -- Simplest case first: an anchored match need be tried only once
3320 if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then
3321 Matched := Try (First_In_Data);
3323 elsif Self.Anchored then
3325 Next_Try : Natural := First_In_Data;
3327 -- Test the first position in the buffer
3328 Matched := Try (Next_Try);
3330 -- Else only test after newlines
3333 while Next_Try <= Last_In_Data loop
3334 while Next_Try <= Last_In_Data
3335 and then Data (Next_Try) /= ASCII.LF
3337 Next_Try := Next_Try + 1;
3340 Next_Try := Next_Try + 1;
3342 if Next_Try <= Last_In_Data then
3343 Matched := Try (Next_Try);
3350 elsif Self.First /= ASCII.NUL then
3351 -- We know what char it must start with
3354 Next_Try : Natural := Index (First_In_Data, Self.First);
3357 while Next_Try /= 0 loop
3358 Matched := Try (Next_Try);
3360 Next_Try := Index (Next_Try + 1, Self.First);
3365 -- Messy cases: try all locations (including for the empty string)
3367 Matched := Try (First_In_Data);
3370 for S in First_In_Data + 1 .. Last_In_Data loop
3377 -- Matched has its value
3379 for J in Last_Paren + 1 .. Matches'Last loop
3380 Matches_Full (J) := No_Match;
3383 Matches := Matches_Full (Matches'Range);
3388 (Self : Pattern_Matcher;
3390 Data_First : Integer := -1;
3391 Data_Last : Positive := Positive'Last)
3394 Matches : Match_Array (0 .. 0);
3397 Match (Self, Data, Matches, Data_First, Data_Last);
3398 if Matches (0) = No_Match then
3399 return Data'First - 1;
3401 return Matches (0).First;
3406 (Self : Pattern_Matcher;
3408 Data_First : Integer := -1;
3409 Data_Last : Positive := Positive'Last)
3412 Matches : Match_Array (0 .. 0);
3415 Match (Self, Data, Matches, Data_First, Data_Last);
3416 return Matches (0).First >= Data'First;
3420 (Expression : String;
3422 Matches : out Match_Array;
3423 Size : Program_Size := 0;
3424 Data_First : Integer := -1;
3425 Data_Last : Positive := Positive'Last)
3427 PM : Pattern_Matcher (Size);
3428 Finalize_Size : Program_Size;
3432 Match (Compile (Expression), Data, Matches, Data_First, Data_Last);
3434 Compile (PM, Expression, Finalize_Size);
3435 Match (PM, Data, Matches, Data_First, Data_Last);
3440 (Expression : String;
3442 Size : Program_Size := 0;
3443 Data_First : Integer := -1;
3444 Data_Last : Positive := Positive'Last)
3447 PM : Pattern_Matcher (Size);
3448 Final_Size : Program_Size; -- unused
3452 return Match (Compile (Expression), Data, Data_First, Data_Last);
3454 Compile (PM, Expression, Final_Size);
3455 return Match (PM, Data, Data_First, Data_Last);
3460 (Expression : String;
3462 Size : Program_Size := 0;
3463 Data_First : Integer := -1;
3464 Data_Last : Positive := Positive'Last)
3467 Matches : Match_Array (0 .. 0);
3468 PM : Pattern_Matcher (Size);
3469 Final_Size : Program_Size; -- unused
3473 Match (Compile (Expression), Data, Matches, Data_First, Data_Last);
3475 Compile (PM, Expression, Final_Size);
3476 Match (PM, Data, Matches, Data_First, Data_Last);
3479 return Matches (0).First >= Data'First;
3486 function Operand (P : Pointer) return Pointer is
3495 procedure Optimize (Self : in out Pattern_Matcher) is
3496 Max_Length : Program_Size;
3497 This_Length : Program_Size;
3500 Program : Program_Data renames Self.Program;
3503 -- Start with safe defaults (no optimization):
3504 -- * No known first character of match
3505 -- * Does not necessarily start at beginning of line
3506 -- * No string known that has to appear in data
3508 Self.First := ASCII.NUL;
3509 Self.Anchored := False;
3510 Self.Must_Have := Program'Last + 1;
3511 Self.Must_Have_Length := 0;
3513 Scan := Program_First + 1; -- First instruction (can be anything)
3515 if Program (Scan) = EXACT then
3516 Self.First := Program (String_Operand (Scan));
3518 elsif Program (Scan) = BOL
3519 or else Program (Scan) = SBOL
3520 or else Program (Scan) = MBOL
3522 Self.Anchored := True;
3525 -- If there's something expensive in the regexp, find the
3526 -- longest literal string that must appear and make it the
3527 -- regmust. Resolve ties in favor of later strings, since
3528 -- the regstart check works with the beginning of the regexp.
3529 -- and avoiding duplication strengthens checking. Not a
3530 -- strong reason, but sufficient in the absence of others.
3532 if False then -- if Flags.SP_Start then ???
3535 while Scan /= 0 loop
3536 if Program (Scan) = EXACT or else Program (Scan) = EXACTF then
3537 This_Length := String_Length (Program, Scan);
3539 if This_Length >= Max_Length then
3540 Longest := String_Operand (Scan);
3541 Max_Length := This_Length;
3545 Scan := Get_Next (Program, Scan);
3548 Self.Must_Have := Longest;
3549 Self.Must_Have_Length := Natural (Max_Length) + 1;
3557 function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is
3559 return Regexp.Paren_Count;
3566 function Quote (Str : String) return String is
3567 S : String (1 .. Str'Length * 2);
3568 Last : Natural := 0;
3571 for J in Str'Range loop
3573 when '^' | '$' | '|' | '*' | '+' | '?' | '{' |
3574 '}' | '[' | ']' | '(' | ')' | '\' =>
3576 S (Last + 1) := '\';
3577 S (Last + 2) := Str (J);
3581 S (Last + 1) := Str (J);
3586 return S (1 .. Last);
3593 function Read_Natural
3594 (Program : Program_Data;
3599 return Character'Pos (Program (IP)) +
3600 256 * Character'Pos (Program (IP + 1));
3607 procedure Reset_Class (Bitmap : out Character_Class) is
3609 Bitmap := (others => 0);
3616 procedure Set_In_Class
3617 (Bitmap : in out Character_Class;
3620 Value : constant Class_Byte := Character'Pos (C);
3623 Bitmap (Value / 8) := Bitmap (Value / 8)
3624 or Bit_Conversion (Value mod 8);
3631 function String_Length
3632 (Program : Program_Data;
3637 pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF);
3638 return Character'Pos (Program (P + 3));
3641 --------------------
3642 -- String_Operand --
3643 --------------------
3645 function String_Operand (P : Pointer) return Pointer is