[testsuite]
[official-gcc.git] / gcc / ada / libgnat / s-regexp.adb
blob9e53c4ebdae915e8ebd46eaa28362dfa4d40f919
1 ------------------------------------------------------------------------------
2 -- --
3 -- GNAT COMPILER COMPONENTS --
4 -- --
5 -- S Y S T E M . R E G E X P --
6 -- --
7 -- B o d y --
8 -- --
9 -- Copyright (C) 1999-2018, AdaCore --
10 -- --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
17 -- --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
21 -- --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
26 -- --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
29 -- --
30 ------------------------------------------------------------------------------
32 with Ada.Unchecked_Deallocation;
33 with System.Case_Util;
35 package body System.Regexp is
37 Initial_Max_States_In_Primary_Table : constant := 100;
38 -- Initial size for the number of states in the indefinite state
39 -- machine. The number of states will be increased as needed.
41 -- This is also used as the maximal number of meta states (groups of
42 -- states) in the secondary table.
44 Open_Paren : constant Character := '(';
45 Close_Paren : constant Character := ')';
46 Open_Bracket : constant Character := '[';
47 Close_Bracket : constant Character := ']';
49 type State_Index is new Natural;
50 type Column_Index is new Natural;
52 type Regexp_Array is array
53 (State_Index range <>, Column_Index range <>) of State_Index;
54 -- First index is for the state number. Second index is for the character
55 -- type. Contents is the new State.
57 type Regexp_Array_Access is access Regexp_Array;
58 -- Use this type through the functions Set below, so that it can grow
59 -- dynamically depending on the needs.
61 type Mapping is array (Character'Range) of Column_Index;
62 -- Mapping between characters and column in the Regexp_Array
64 type Boolean_Array is array (State_Index range <>) of Boolean;
66 type Regexp_Value
67 (Alphabet_Size : Column_Index;
68 Num_States : State_Index) is
69 record
70 Map : Mapping;
71 Case_Sensitive : Boolean;
72 States : Regexp_Array (1 .. Num_States, 0 .. Alphabet_Size);
73 Is_Final : Boolean_Array (1 .. Num_States);
74 end record;
75 -- Deterministic finite-state machine
77 -----------------------
78 -- Local Subprograms --
79 -----------------------
81 procedure Set
82 (Table : in out Regexp_Array_Access;
83 State : State_Index;
84 Column : Column_Index;
85 Value : State_Index);
86 -- Sets a value in the table. If the table is too small, reallocate it
87 -- dynamically so that (State, Column) is a valid index in it.
89 function Get
90 (Table : Regexp_Array_Access;
91 State : State_Index;
92 Column : Column_Index) return State_Index;
93 -- Returns the value in the table at (State, Column). If this index does
94 -- not exist in the table, returns zero.
96 procedure Free is new Ada.Unchecked_Deallocation
97 (Regexp_Array, Regexp_Array_Access);
99 ------------
100 -- Adjust --
101 ------------
103 procedure Adjust (R : in out Regexp) is
104 Tmp : Regexp_Access;
105 begin
106 if R.R /= null then
107 Tmp := new Regexp_Value (Alphabet_Size => R.R.Alphabet_Size,
108 Num_States => R.R.Num_States);
109 Tmp.all := R.R.all;
110 R.R := Tmp;
111 end if;
112 end Adjust;
114 -------------
115 -- Compile --
116 -------------
118 function Compile
119 (Pattern : String;
120 Glob : Boolean := False;
121 Case_Sensitive : Boolean := True) return Regexp
123 S : String := Pattern;
124 -- The pattern which is really compiled (when the pattern is case
125 -- insensitive, we convert this string to lower-cases
127 Map : Mapping := (others => 0);
128 -- Mapping between characters and columns in the tables
130 Alphabet_Size : Column_Index := 0;
131 -- Number of significant characters in the regular expression.
132 -- This total does not include special operators, such as *, (, ...
134 procedure Check_Well_Formed_Pattern;
135 -- Check that the pattern to compile is well-formed, so that subsequent
136 -- code can rely on this without performing each time the checks to
137 -- avoid accessing the pattern outside its bounds. However, not all
138 -- well-formedness rules are checked. In particular, rules about special
139 -- characters not being treated as regular characters are not checked.
141 procedure Create_Mapping;
142 -- Creates a mapping between characters in the regexp and columns
143 -- in the tables representing the regexp. Test that the regexp is
144 -- well-formed Modifies Alphabet_Size and Map
146 procedure Create_Primary_Table
147 (Table : out Regexp_Array_Access;
148 Num_States : out State_Index;
149 Start_State : out State_Index;
150 End_State : out State_Index);
151 -- Creates the first version of the regexp (this is a non deterministic
152 -- finite state machine, which is unadapted for a fast pattern
153 -- matching algorithm). We use a recursive algorithm to process the
154 -- parenthesis sub-expressions.
156 -- Table : at the end of the procedure : Column 0 is for any character
157 -- ('.') and the last columns are for no character (closure). Num_States
158 -- is set to the number of states in the table Start_State is the number
159 -- of the starting state in the regexp End_State is the number of the
160 -- final state when the regexp matches.
162 procedure Create_Primary_Table_Glob
163 (Table : out Regexp_Array_Access;
164 Num_States : out State_Index;
165 Start_State : out State_Index;
166 End_State : out State_Index);
167 -- Same function as above, but it deals with the second possible
168 -- grammar for 'globbing pattern', which is a kind of subset of the
169 -- whole regular expression grammar.
171 function Create_Secondary_Table
172 (First_Table : Regexp_Array_Access;
173 Start_State : State_Index;
174 End_State : State_Index) return Regexp;
175 -- Creates the definitive table representing the regular expression
176 -- This is actually a transformation of the primary table First_Table,
177 -- where every state is grouped with the states in its 'no-character'
178 -- columns. The transitions between the new states are then recalculated
179 -- and if necessary some new states are created.
181 -- Note that the resulting finite-state machine is not optimized in
182 -- terms of the number of states : it would be more time-consuming to
183 -- add a third pass to reduce the number of states in the machine, with
184 -- no speed improvement...
186 procedure Raise_Exception (M : String; Index : Integer);
187 pragma No_Return (Raise_Exception);
188 -- Raise an exception, indicating an error at character Index in S
190 -------------------------------
191 -- Check_Well_Formed_Pattern --
192 -------------------------------
194 procedure Check_Well_Formed_Pattern is
195 J : Integer;
197 Past_Elmt : Boolean := False;
198 -- Set to True everywhere an elmt has been parsed, if Glob=False,
199 -- meaning there can be now an occurrence of '*', '+' and '?'.
201 Past_Term : Boolean := False;
202 -- Set to True everywhere a term has been parsed, if Glob=False,
203 -- meaning there can be now an occurrence of '|'.
205 Parenthesis_Level : Integer := 0;
206 Curly_Level : Integer := 0;
208 Last_Open : Integer := S'First - 1;
209 -- The last occurrence of an opening parenthesis, if Glob=False,
210 -- or the last occurrence of an opening curly brace, if Glob=True.
212 procedure Raise_Exception_If_No_More_Chars (K : Integer := 0);
213 -- If no more characters are raised, call Raise_Exception
215 --------------------------------------
216 -- Raise_Exception_If_No_More_Chars --
217 --------------------------------------
219 procedure Raise_Exception_If_No_More_Chars (K : Integer := 0) is
220 begin
221 if J + K > S'Last then
222 Raise_Exception ("Ill-formed pattern while parsing", J);
223 end if;
224 end Raise_Exception_If_No_More_Chars;
226 -- Start of processing for Check_Well_Formed_Pattern
228 begin
229 J := S'First;
230 while J <= S'Last loop
231 case S (J) is
232 when Open_Bracket =>
233 J := J + 1;
234 Raise_Exception_If_No_More_Chars;
236 if not Glob then
237 if S (J) = '^' then
238 J := J + 1;
239 Raise_Exception_If_No_More_Chars;
240 end if;
241 end if;
243 -- The first character never has a special meaning
245 if S (J) = ']' or else S (J) = '-' then
246 J := J + 1;
247 Raise_Exception_If_No_More_Chars;
248 end if;
250 -- The set of characters cannot be empty
252 if S (J) = ']' then
253 Raise_Exception
254 ("Set of characters cannot be empty in regular "
255 & "expression", J);
256 end if;
258 declare
259 Possible_Range_Start : Boolean := True;
260 -- Set True everywhere a range character '-' can occur
262 begin
263 loop
264 exit when S (J) = Close_Bracket;
266 -- The current character should be followed by a
267 -- closing bracket.
269 Raise_Exception_If_No_More_Chars (1);
271 if S (J) = '-'
272 and then S (J + 1) /= Close_Bracket
273 then
274 if not Possible_Range_Start then
275 Raise_Exception
276 ("No mix of ranges is allowed in "
277 & "regular expression", J);
278 end if;
280 J := J + 1;
281 Raise_Exception_If_No_More_Chars;
283 -- Range cannot be followed by '-' character,
284 -- except as last character in the set.
286 Possible_Range_Start := False;
288 else
289 Possible_Range_Start := True;
290 end if;
292 if S (J) = '\' then
293 J := J + 1;
294 Raise_Exception_If_No_More_Chars;
295 end if;
297 J := J + 1;
298 end loop;
299 end;
301 -- A closing bracket can end an elmt or term
303 Past_Elmt := True;
304 Past_Term := True;
306 when Close_Bracket =>
308 -- A close bracket must follow a open_bracket, and cannot be
309 -- found alone on the line.
311 Raise_Exception
312 ("Incorrect character ']' in regular expression", J);
314 when '\' =>
315 if J < S'Last then
316 J := J + 1;
318 -- Any character can be an elmt or a term
320 Past_Elmt := True;
321 Past_Term := True;
323 else
324 -- \ not allowed at the end of the regexp
326 Raise_Exception
327 ("Incorrect character '\' in regular expression", J);
328 end if;
330 when Open_Paren =>
331 if not Glob then
332 Parenthesis_Level := Parenthesis_Level + 1;
333 Last_Open := J;
335 -- An open parenthesis does not end an elmt or term
337 Past_Elmt := False;
338 Past_Term := False;
339 end if;
341 when Close_Paren =>
342 if not Glob then
343 Parenthesis_Level := Parenthesis_Level - 1;
345 if Parenthesis_Level < 0 then
346 Raise_Exception
347 ("')' is not associated with '(' in regular "
348 & "expression", J);
349 end if;
351 if J = Last_Open + 1 then
352 Raise_Exception
353 ("Empty parentheses not allowed in regular "
354 & "expression", J);
355 end if;
357 if not Past_Term then
358 Raise_Exception
359 ("Closing parenthesis not allowed here in regular "
360 & "expression", J);
361 end if;
363 -- A closing parenthesis can end an elmt or term
365 Past_Elmt := True;
366 Past_Term := True;
367 end if;
369 when '{' =>
370 if Glob then
371 Curly_Level := Curly_Level + 1;
372 Last_Open := J;
374 else
375 -- Any character can be an elmt or a term
377 Past_Elmt := True;
378 Past_Term := True;
379 end if;
381 -- No need to check for ',' as the code always accepts them
383 when '}' =>
384 if Glob then
385 Curly_Level := Curly_Level - 1;
387 if Curly_Level < 0 then
388 Raise_Exception
389 ("'}' is not associated with '{' in regular "
390 & "expression", J);
391 end if;
393 if J = Last_Open + 1 then
394 Raise_Exception
395 ("Empty curly braces not allowed in regular "
396 & "expression", J);
397 end if;
399 else
400 -- Any character can be an elmt or a term
402 Past_Elmt := True;
403 Past_Term := True;
404 end if;
406 when '*' | '?' | '+' =>
407 if not Glob then
409 -- These operators must apply to an elmt sub-expression,
410 -- and cannot be found if one has not just been parsed.
412 if not Past_Elmt then
413 Raise_Exception
414 ("'*', '+' and '?' operators must be "
415 & "applied to an element in regular expression", J);
416 end if;
418 Past_Elmt := False;
419 Past_Term := True;
420 end if;
422 when '|' =>
423 if not Glob then
425 -- This operator must apply to a term sub-expression,
426 -- and cannot be found if one has not just been parsed.
428 if not Past_Term then
429 Raise_Exception
430 ("'|' operator must be "
431 & "applied to a term in regular expression", J);
432 end if;
434 Past_Elmt := False;
435 Past_Term := False;
436 end if;
438 when others =>
439 if not Glob then
441 -- Any character can be an elmt or a term
443 Past_Elmt := True;
444 Past_Term := True;
445 end if;
446 end case;
448 J := J + 1;
449 end loop;
451 -- A closing parenthesis must follow an open parenthesis
453 if Parenthesis_Level /= 0 then
454 Raise_Exception
455 ("'(' must always be associated with a ')'", J);
456 end if;
458 -- A closing curly brace must follow an open curly brace
460 if Curly_Level /= 0 then
461 Raise_Exception
462 ("'{' must always be associated with a '}'", J);
463 end if;
464 end Check_Well_Formed_Pattern;
466 --------------------
467 -- Create_Mapping --
468 --------------------
470 procedure Create_Mapping is
472 procedure Add_In_Map (C : Character);
473 -- Add a character in the mapping, if it is not already defined
475 ----------------
476 -- Add_In_Map --
477 ----------------
479 procedure Add_In_Map (C : Character) is
480 begin
481 if Map (C) = 0 then
482 Alphabet_Size := Alphabet_Size + 1;
483 Map (C) := Alphabet_Size;
484 end if;
485 end Add_In_Map;
487 J : Integer := S'First;
488 Parenthesis_Level : Integer := 0;
489 Curly_Level : Integer := 0;
490 Last_Open : Integer := S'First - 1;
492 -- Start of processing for Create_Mapping
494 begin
495 while J <= S'Last loop
496 case S (J) is
497 when Open_Bracket =>
498 J := J + 1;
500 if S (J) = '^' then
501 J := J + 1;
502 end if;
504 if S (J) = ']' or else S (J) = '-' then
505 J := J + 1;
506 end if;
508 -- The first character never has a special meaning
510 loop
511 if J > S'Last then
512 Raise_Exception
513 ("Ran out of characters while parsing ", J);
514 end if;
516 exit when S (J) = Close_Bracket;
518 if S (J) = '-'
519 and then S (J + 1) /= Close_Bracket
520 then
521 declare
522 Start : constant Integer := J - 1;
524 begin
525 J := J + 1;
527 if S (J) = '\' then
528 J := J + 1;
529 end if;
531 for Char in S (Start) .. S (J) loop
532 Add_In_Map (Char);
533 end loop;
534 end;
535 else
536 if S (J) = '\' then
537 J := J + 1;
538 end if;
540 Add_In_Map (S (J));
541 end if;
543 J := J + 1;
544 end loop;
546 -- A close bracket must follow a open_bracket and cannot be
547 -- found alone on the line
549 when Close_Bracket =>
550 Raise_Exception
551 ("Incorrect character ']' in regular expression", J);
553 when '\' =>
554 if J < S'Last then
555 J := J + 1;
556 Add_In_Map (S (J));
558 else
559 -- Back slash \ not allowed at the end of the regexp
561 Raise_Exception
562 ("Incorrect character '\' in regular expression", J);
563 end if;
565 when Open_Paren =>
566 if not Glob then
567 Parenthesis_Level := Parenthesis_Level + 1;
568 Last_Open := J;
569 else
570 Add_In_Map (Open_Paren);
571 end if;
573 when Close_Paren =>
574 if not Glob then
575 Parenthesis_Level := Parenthesis_Level - 1;
577 if Parenthesis_Level < 0 then
578 Raise_Exception
579 ("')' is not associated with '(' in regular "
580 & "expression", J);
581 end if;
583 if J = Last_Open + 1 then
584 Raise_Exception
585 ("Empty parenthesis not allowed in regular "
586 & "expression", J);
587 end if;
589 else
590 Add_In_Map (Close_Paren);
591 end if;
593 when '.' =>
594 if Glob then
595 Add_In_Map ('.');
596 end if;
598 when '{' =>
599 if not Glob then
600 Add_In_Map (S (J));
601 else
602 Curly_Level := Curly_Level + 1;
603 end if;
605 when '}' =>
606 if not Glob then
607 Add_In_Map (S (J));
608 else
609 Curly_Level := Curly_Level - 1;
610 end if;
612 when '*' | '?' =>
613 if not Glob then
614 if J = S'First then
615 Raise_Exception
616 ("'*', '+', '?' and '|' operators cannot be in "
617 & "first position in regular expression", J);
618 end if;
619 end if;
621 when '|' | '+' =>
622 if not Glob then
623 if J = S'First then
625 -- These operators must apply to a sub-expression,
626 -- and cannot be found at the beginning of the line
628 Raise_Exception
629 ("'*', '+', '?' and '|' operators cannot be in "
630 & "first position in regular expression", J);
631 end if;
633 else
634 Add_In_Map (S (J));
635 end if;
637 when others =>
638 Add_In_Map (S (J));
639 end case;
641 J := J + 1;
642 end loop;
644 -- A closing parenthesis must follow an open parenthesis
646 if Parenthesis_Level /= 0 then
647 Raise_Exception
648 ("'(' must always be associated with a ')'", J);
649 end if;
651 if Curly_Level /= 0 then
652 Raise_Exception
653 ("'{' must always be associated with a '}'", J);
654 end if;
655 end Create_Mapping;
657 --------------------------
658 -- Create_Primary_Table --
659 --------------------------
661 procedure Create_Primary_Table
662 (Table : out Regexp_Array_Access;
663 Num_States : out State_Index;
664 Start_State : out State_Index;
665 End_State : out State_Index)
667 Empty_Char : constant Column_Index := Alphabet_Size + 1;
669 Current_State : State_Index := 0;
670 -- Index of the last created state
672 procedure Add_Empty_Char
673 (State : State_Index;
674 To_State : State_Index);
675 -- Add a empty-character transition from State to To_State
677 procedure Create_Repetition
678 (Repetition : Character;
679 Start_Prev : State_Index;
680 End_Prev : State_Index;
681 New_Start : out State_Index;
682 New_End : in out State_Index);
683 -- Create the table in case we have a '*', '+' or '?'.
684 -- Start_Prev .. End_Prev should indicate respectively the start and
685 -- end index of the previous expression, to which '*', '+' or '?' is
686 -- applied.
688 procedure Create_Simple
689 (Start_Index : Integer;
690 End_Index : Integer;
691 Start_State : out State_Index;
692 End_State : out State_Index);
693 -- Fill the table for the regexp Simple. This is the recursive
694 -- procedure called to handle () expressions If End_State = 0, then
695 -- the call to Create_Simple creates an independent regexp, not a
696 -- concatenation Start_Index .. End_Index is the starting index in
697 -- the string S.
699 -- Warning: it may look like we are creating too many empty-string
700 -- transitions, but they are needed to get the correct regexp.
701 -- The table is filled as follow ( s means start-state, e means
702 -- end-state) :
704 -- regexp state_num | a b * empty_string
705 -- ------- ------------------------------
706 -- a 1 (s) | 2 - - -
707 -- 2 (e) | - - - -
709 -- ab 1 (s) | 2 - - -
710 -- 2 | - - - 3
711 -- 3 | - 4 - -
712 -- 4 (e) | - - - -
714 -- a|b 1 | 2 - - -
715 -- 2 | - - - 6
716 -- 3 | - 4 - -
717 -- 4 | - - - 6
718 -- 5 (s) | - - - 1,3
719 -- 6 (e) | - - - -
721 -- a* 1 | 2 - - -
722 -- 2 | - - - 4
723 -- 3 (s) | - - - 1,4
724 -- 4 (e) | - - - 3
726 -- (a) 1 (s) | 2 - - -
727 -- 2 (e) | - - - -
729 -- a+ 1 | 2 - - -
730 -- 2 | - - - 4
731 -- 3 (s) | - - - 1
732 -- 4 (e) | - - - 3
734 -- a? 1 | 2 - - -
735 -- 2 | - - - 4
736 -- 3 (s) | - - - 1,4
737 -- 4 (e) | - - - -
739 -- . 1 (s) | 2 2 2 -
740 -- 2 (e) | - - - -
742 function Next_Sub_Expression
743 (Start_Index : Integer;
744 End_Index : Integer) return Integer;
745 -- Returns the index of the last character of the next sub-expression
746 -- in Simple. Index cannot be greater than End_Index.
748 --------------------
749 -- Add_Empty_Char --
750 --------------------
752 procedure Add_Empty_Char
753 (State : State_Index;
754 To_State : State_Index)
756 J : Column_Index := Empty_Char;
758 begin
759 while Get (Table, State, J) /= 0 loop
760 J := J + 1;
761 end loop;
763 Set (Table, State, J, To_State);
764 end Add_Empty_Char;
766 -----------------------
767 -- Create_Repetition --
768 -----------------------
770 procedure Create_Repetition
771 (Repetition : Character;
772 Start_Prev : State_Index;
773 End_Prev : State_Index;
774 New_Start : out State_Index;
775 New_End : in out State_Index)
777 begin
778 New_Start := Current_State + 1;
780 if New_End /= 0 then
781 Add_Empty_Char (New_End, New_Start);
782 end if;
784 Current_State := Current_State + 2;
785 New_End := Current_State;
787 Add_Empty_Char (End_Prev, New_End);
788 Add_Empty_Char (New_Start, Start_Prev);
790 if Repetition /= '+' then
791 Add_Empty_Char (New_Start, New_End);
792 end if;
794 if Repetition /= '?' then
795 Add_Empty_Char (New_End, New_Start);
796 end if;
797 end Create_Repetition;
799 -------------------
800 -- Create_Simple --
801 -------------------
803 procedure Create_Simple
804 (Start_Index : Integer;
805 End_Index : Integer;
806 Start_State : out State_Index;
807 End_State : out State_Index)
809 J : Integer := Start_Index;
810 Last_Start : State_Index := 0;
812 begin
813 Start_State := 0;
814 End_State := 0;
815 while J <= End_Index loop
816 case S (J) is
817 when Open_Paren =>
818 declare
819 J_Start : constant Integer := J + 1;
820 Next_Start : State_Index;
821 Next_End : State_Index;
823 begin
824 J := Next_Sub_Expression (J, End_Index);
825 Create_Simple (J_Start, J - 1, Next_Start, Next_End);
827 if J < End_Index
828 and then (S (J + 1) = '*' or else
829 S (J + 1) = '+' or else
830 S (J + 1) = '?')
831 then
832 J := J + 1;
833 Create_Repetition
834 (S (J),
835 Next_Start,
836 Next_End,
837 Last_Start,
838 End_State);
840 else
841 Last_Start := Next_Start;
843 if End_State /= 0 then
844 Add_Empty_Char (End_State, Last_Start);
845 end if;
847 End_State := Next_End;
848 end if;
849 end;
851 when '|' =>
852 declare
853 Start_Prev : constant State_Index := Start_State;
854 End_Prev : constant State_Index := End_State;
855 Start_J : constant Integer := J + 1;
856 Start_Next : State_Index := 0;
857 End_Next : State_Index := 0;
859 begin
860 J := Next_Sub_Expression (J, End_Index);
862 -- Create a new state for the start of the alternative
864 Current_State := Current_State + 1;
865 Last_Start := Current_State;
866 Start_State := Last_Start;
868 -- Create the tree for the second part of alternative
870 Create_Simple (Start_J, J, Start_Next, End_Next);
872 -- Create the end state
874 Add_Empty_Char (Last_Start, Start_Next);
875 Add_Empty_Char (Last_Start, Start_Prev);
876 Current_State := Current_State + 1;
877 End_State := Current_State;
878 Add_Empty_Char (End_Prev, End_State);
879 Add_Empty_Char (End_Next, End_State);
880 end;
882 when Open_Bracket =>
883 Current_State := Current_State + 1;
885 declare
886 Next_State : State_Index := Current_State + 1;
888 begin
889 J := J + 1;
891 if S (J) = '^' then
892 J := J + 1;
894 Next_State := 0;
896 for Column in 0 .. Alphabet_Size loop
897 Set (Table, Current_State, Column,
898 Value => Current_State + 1);
899 end loop;
900 end if;
902 -- Automatically add the first character
904 if S (J) = '-' or else S (J) = ']' then
905 Set (Table, Current_State, Map (S (J)),
906 Value => Next_State);
907 J := J + 1;
908 end if;
910 -- Loop till closing bracket found
912 loop
913 exit when S (J) = Close_Bracket;
915 if S (J) = '-'
916 and then S (J + 1) /= ']'
917 then
918 declare
919 Start : constant Integer := J - 1;
921 begin
922 J := J + 1;
924 if S (J) = '\' then
925 J := J + 1;
926 end if;
928 for Char in S (Start) .. S (J) loop
929 Set (Table, Current_State, Map (Char),
930 Value => Next_State);
931 end loop;
932 end;
934 else
935 if S (J) = '\' then
936 J := J + 1;
937 end if;
939 Set (Table, Current_State, Map (S (J)),
940 Value => Next_State);
941 end if;
942 J := J + 1;
943 end loop;
944 end;
946 Current_State := Current_State + 1;
948 -- If the next symbol is a special symbol
950 if J < End_Index
951 and then (S (J + 1) = '*' or else
952 S (J + 1) = '+' or else
953 S (J + 1) = '?')
954 then
955 J := J + 1;
956 Create_Repetition
957 (S (J),
958 Current_State - 1,
959 Current_State,
960 Last_Start,
961 End_State);
963 else
964 Last_Start := Current_State - 1;
966 if End_State /= 0 then
967 Add_Empty_Char (End_State, Last_Start);
968 end if;
970 End_State := Current_State;
971 end if;
973 when Close_Bracket
974 | Close_Paren
975 | '*' | '+' | '?'
977 Raise_Exception
978 ("Incorrect character in regular expression :", J);
980 when others =>
981 Current_State := Current_State + 1;
983 -- Create the state for the symbol S (J)
985 if S (J) = '.' then
986 for K in 0 .. Alphabet_Size loop
987 Set (Table, Current_State, K,
988 Value => Current_State + 1);
989 end loop;
991 else
992 if S (J) = '\' then
993 J := J + 1;
994 end if;
996 Set (Table, Current_State, Map (S (J)),
997 Value => Current_State + 1);
998 end if;
1000 Current_State := Current_State + 1;
1002 -- If the next symbol is a special symbol
1004 if J < End_Index
1005 and then (S (J + 1) = '*' or else
1006 S (J + 1) = '+' or else
1007 S (J + 1) = '?')
1008 then
1009 J := J + 1;
1010 Create_Repetition
1011 (S (J),
1012 Current_State - 1,
1013 Current_State,
1014 Last_Start,
1015 End_State);
1017 else
1018 Last_Start := Current_State - 1;
1020 if End_State /= 0 then
1021 Add_Empty_Char (End_State, Last_Start);
1022 end if;
1024 End_State := Current_State;
1025 end if;
1026 end case;
1028 if Start_State = 0 then
1029 Start_State := Last_Start;
1030 end if;
1032 J := J + 1;
1033 end loop;
1034 end Create_Simple;
1036 -------------------------
1037 -- Next_Sub_Expression --
1038 -------------------------
1040 function Next_Sub_Expression
1041 (Start_Index : Integer;
1042 End_Index : Integer) return Integer
1044 J : Integer := Start_Index;
1045 Start_On_Alter : Boolean := False;
1047 begin
1048 if S (J) = '|' then
1049 Start_On_Alter := True;
1050 end if;
1052 loop
1053 exit when J = End_Index;
1054 J := J + 1;
1056 case S (J) is
1057 when '\' =>
1058 J := J + 1;
1060 when Open_Bracket =>
1061 loop
1062 J := J + 1;
1063 exit when S (J) = Close_Bracket;
1065 if S (J) = '\' then
1066 J := J + 1;
1067 end if;
1068 end loop;
1070 when Open_Paren =>
1071 J := Next_Sub_Expression (J, End_Index);
1073 when Close_Paren =>
1074 return J;
1076 when '|' =>
1077 if Start_On_Alter then
1078 return J - 1;
1079 end if;
1081 when others =>
1082 null;
1083 end case;
1084 end loop;
1086 return J;
1087 end Next_Sub_Expression;
1089 -- Start of processing for Create_Primary_Table
1091 begin
1092 Table.all := (others => (others => 0));
1093 Create_Simple (S'First, S'Last, Start_State, End_State);
1094 Num_States := Current_State;
1095 end Create_Primary_Table;
1097 -------------------------------
1098 -- Create_Primary_Table_Glob --
1099 -------------------------------
1101 procedure Create_Primary_Table_Glob
1102 (Table : out Regexp_Array_Access;
1103 Num_States : out State_Index;
1104 Start_State : out State_Index;
1105 End_State : out State_Index)
1107 Empty_Char : constant Column_Index := Alphabet_Size + 1;
1109 Current_State : State_Index := 0;
1110 -- Index of the last created state
1112 procedure Add_Empty_Char
1113 (State : State_Index;
1114 To_State : State_Index);
1115 -- Add a empty-character transition from State to To_State
1117 procedure Create_Simple
1118 (Start_Index : Integer;
1119 End_Index : Integer;
1120 Start_State : out State_Index;
1121 End_State : out State_Index);
1122 -- Fill the table for the S (Start_Index .. End_Index).
1123 -- This is the recursive procedure called to handle () expressions
1125 --------------------
1126 -- Add_Empty_Char --
1127 --------------------
1129 procedure Add_Empty_Char
1130 (State : State_Index;
1131 To_State : State_Index)
1133 J : Column_Index;
1135 begin
1136 J := Empty_Char;
1137 while Get (Table, State, J) /= 0 loop
1138 J := J + 1;
1139 end loop;
1141 Set (Table, State, J, Value => To_State);
1142 end Add_Empty_Char;
1144 -------------------
1145 -- Create_Simple --
1146 -------------------
1148 procedure Create_Simple
1149 (Start_Index : Integer;
1150 End_Index : Integer;
1151 Start_State : out State_Index;
1152 End_State : out State_Index)
1154 J : Integer;
1155 Last_Start : State_Index := 0;
1157 begin
1158 Start_State := 0;
1159 End_State := 0;
1161 J := Start_Index;
1162 while J <= End_Index loop
1163 case S (J) is
1164 when Open_Bracket =>
1165 Current_State := Current_State + 1;
1167 declare
1168 Next_State : State_Index := Current_State + 1;
1170 begin
1171 J := J + 1;
1173 if S (J) = '^' then
1174 J := J + 1;
1175 Next_State := 0;
1177 for Column in 0 .. Alphabet_Size loop
1178 Set (Table, Current_State, Column,
1179 Value => Current_State + 1);
1180 end loop;
1181 end if;
1183 -- Automatically add the first character
1185 if S (J) = '-' or else S (J) = ']' then
1186 Set (Table, Current_State, Map (S (J)),
1187 Value => Current_State);
1188 J := J + 1;
1189 end if;
1191 -- Loop till closing bracket found
1193 loop
1194 exit when S (J) = Close_Bracket;
1196 if S (J) = '-'
1197 and then S (J + 1) /= ']'
1198 then
1199 declare
1200 Start : constant Integer := J - 1;
1202 begin
1203 J := J + 1;
1205 if S (J) = '\' then
1206 J := J + 1;
1207 end if;
1209 for Char in S (Start) .. S (J) loop
1210 Set (Table, Current_State, Map (Char),
1211 Value => Next_State);
1212 end loop;
1213 end;
1215 else
1216 if S (J) = '\' then
1217 J := J + 1;
1218 end if;
1220 Set (Table, Current_State, Map (S (J)),
1221 Value => Next_State);
1222 end if;
1223 J := J + 1;
1224 end loop;
1225 end;
1227 Last_Start := Current_State;
1228 Current_State := Current_State + 1;
1230 if End_State /= 0 then
1231 Add_Empty_Char (End_State, Last_Start);
1232 end if;
1234 End_State := Current_State;
1236 when '{' =>
1237 declare
1238 End_Sub : Integer;
1239 Start_Regexp_Sub : State_Index;
1240 End_Regexp_Sub : State_Index;
1241 Create_Start : State_Index := 0;
1243 Create_End : State_Index := 0;
1244 -- Initialized to avoid junk warning
1246 begin
1247 while S (J) /= '}' loop
1249 -- First step : find sub pattern
1251 End_Sub := J + 1;
1252 while S (End_Sub) /= ','
1253 and then S (End_Sub) /= '}'
1254 loop
1255 End_Sub := End_Sub + 1;
1256 end loop;
1258 -- Second step : create a sub pattern
1260 Create_Simple
1261 (J + 1,
1262 End_Sub - 1,
1263 Start_Regexp_Sub,
1264 End_Regexp_Sub);
1266 J := End_Sub;
1268 -- Third step : create an alternative
1270 if Create_Start = 0 then
1271 Current_State := Current_State + 1;
1272 Create_Start := Current_State;
1273 Add_Empty_Char (Create_Start, Start_Regexp_Sub);
1274 Current_State := Current_State + 1;
1275 Create_End := Current_State;
1276 Add_Empty_Char (End_Regexp_Sub, Create_End);
1278 else
1279 Current_State := Current_State + 1;
1280 Add_Empty_Char (Current_State, Create_Start);
1281 Create_Start := Current_State;
1282 Add_Empty_Char (Create_Start, Start_Regexp_Sub);
1283 Add_Empty_Char (End_Regexp_Sub, Create_End);
1284 end if;
1285 end loop;
1287 if End_State /= 0 then
1288 Add_Empty_Char (End_State, Create_Start);
1289 end if;
1291 End_State := Create_End;
1292 Last_Start := Create_Start;
1293 end;
1295 when '*' =>
1296 Current_State := Current_State + 1;
1298 if End_State /= 0 then
1299 Add_Empty_Char (End_State, Current_State);
1300 end if;
1302 Add_Empty_Char (Current_State, Current_State + 1);
1303 Add_Empty_Char (Current_State, Current_State + 3);
1304 Last_Start := Current_State;
1306 Current_State := Current_State + 1;
1308 for K in 0 .. Alphabet_Size loop
1309 Set (Table, Current_State, K,
1310 Value => Current_State + 1);
1311 end loop;
1313 Current_State := Current_State + 1;
1314 Add_Empty_Char (Current_State, Current_State + 1);
1316 Current_State := Current_State + 1;
1317 Add_Empty_Char (Current_State, Last_Start);
1318 End_State := Current_State;
1320 when others =>
1321 Current_State := Current_State + 1;
1323 if S (J) = '?' then
1324 for K in 0 .. Alphabet_Size loop
1325 Set (Table, Current_State, K,
1326 Value => Current_State + 1);
1327 end loop;
1329 else
1330 if S (J) = '\' then
1331 J := J + 1;
1332 end if;
1334 -- Create the state for the symbol S (J)
1336 Set (Table, Current_State, Map (S (J)),
1337 Value => Current_State + 1);
1338 end if;
1340 Last_Start := Current_State;
1341 Current_State := Current_State + 1;
1343 if End_State /= 0 then
1344 Add_Empty_Char (End_State, Last_Start);
1345 end if;
1347 End_State := Current_State;
1348 end case;
1350 if Start_State = 0 then
1351 Start_State := Last_Start;
1352 end if;
1354 J := J + 1;
1355 end loop;
1356 end Create_Simple;
1358 -- Start of processing for Create_Primary_Table_Glob
1360 begin
1361 Table.all := (others => (others => 0));
1362 Create_Simple (S'First, S'Last, Start_State, End_State);
1363 Num_States := Current_State;
1364 end Create_Primary_Table_Glob;
1366 ----------------------------
1367 -- Create_Secondary_Table --
1368 ----------------------------
1370 function Create_Secondary_Table
1371 (First_Table : Regexp_Array_Access;
1372 Start_State : State_Index;
1373 End_State : State_Index) return Regexp
1375 Last_Index : constant State_Index := First_Table'Last (1);
1377 type Meta_State is array (0 .. Last_Index) of Boolean;
1378 pragma Pack (Meta_State);
1379 -- Whether a state from first_table belongs to a metastate.
1381 No_States : constant Meta_State := (others => False);
1383 type Meta_States_Array is array (State_Index range <>) of Meta_State;
1384 type Meta_States_List is access all Meta_States_Array;
1385 procedure Unchecked_Free is new Ada.Unchecked_Deallocation
1386 (Meta_States_Array, Meta_States_List);
1387 Meta_States : Meta_States_List;
1388 -- Components of meta-states. A given state might belong to
1389 -- several meta-states.
1390 -- This array grows dynamically.
1392 type Char_To_State is array (0 .. Alphabet_Size) of State_Index;
1393 type Meta_States_Transition_Arr is
1394 array (State_Index range <>) of Char_To_State;
1395 type Meta_States_Transition is access all Meta_States_Transition_Arr;
1396 procedure Unchecked_Free is new Ada.Unchecked_Deallocation
1397 (Meta_States_Transition_Arr, Meta_States_Transition);
1398 Table : Meta_States_Transition;
1399 -- Documents the transitions between each meta-state. The
1400 -- first index is the meta-state, the second column is the
1401 -- character seen in the input, the value is the new meta-state.
1403 Temp_State_Not_Null : Boolean;
1405 Current_State : State_Index := 1;
1406 -- The current meta-state we are creating
1408 Nb_State : State_Index := 1;
1409 -- The total number of meta-states created so far.
1411 procedure Closure
1412 (Meta_State : State_Index;
1413 State : State_Index);
1414 -- Compute the closure of the state (that is every other state which
1415 -- has a empty-character transition) and add it to the state
1417 procedure Ensure_Meta_State (Meta : State_Index);
1418 -- grows the Meta_States array as needed to make sure that there
1419 -- is enough space to store the new meta state.
1421 -----------------------
1422 -- Ensure_Meta_State --
1423 -----------------------
1425 procedure Ensure_Meta_State (Meta : State_Index) is
1426 Tmp : Meta_States_List := Meta_States;
1427 Tmp2 : Meta_States_Transition := Table;
1429 begin
1430 if Meta_States = null then
1431 Meta_States := new Meta_States_Array
1432 (1 .. State_Index'Max (Last_Index, Meta) + 1);
1433 Meta_States (Meta_States'Range) := (others => No_States);
1435 Table := new Meta_States_Transition_Arr
1436 (1 .. State_Index'Max (Last_Index, Meta) + 1);
1437 Table.all := (others => (others => 0));
1439 elsif Meta > Meta_States'Last then
1440 Meta_States := new Meta_States_Array
1441 (1 .. State_Index'Max (2 * Tmp'Last, Meta));
1442 Meta_States (Tmp'Range) := Tmp.all;
1443 Meta_States (Tmp'Last + 1 .. Meta_States'Last) :=
1444 (others => No_States);
1445 Unchecked_Free (Tmp);
1447 Table := new Meta_States_Transition_Arr
1448 (1 .. State_Index'Max (2 * Tmp2'Last, Meta) + 1);
1449 Table (Tmp2'Range) := Tmp2.all;
1450 Table (Tmp2'Last + 1 .. Table'Last) :=
1451 (others => (others => 0));
1452 Unchecked_Free (Tmp2);
1453 end if;
1454 end Ensure_Meta_State;
1456 -------------
1457 -- Closure --
1458 -------------
1460 procedure Closure
1461 (Meta_State : State_Index;
1462 State : State_Index)
1464 begin
1465 if not Meta_States (Meta_State)(State) then
1466 Meta_States (Meta_State)(State) := True;
1468 -- For each transition on empty-character
1470 for Column in Alphabet_Size + 1 .. First_Table'Last (2) loop
1471 exit when First_Table (State, Column) = 0;
1472 Closure (Meta_State, First_Table (State, Column));
1473 end loop;
1474 end if;
1475 end Closure;
1477 -- Start of processing for Create_Secondary_Table
1479 begin
1480 -- Create a new state
1482 Ensure_Meta_State (Current_State);
1483 Closure (Current_State, Start_State);
1485 while Current_State <= Nb_State loop
1487 -- We will be trying, below, to create the next meta-state
1489 Ensure_Meta_State (Nb_State + 1);
1491 -- For every character in the regexp, calculate the possible
1492 -- transitions from Current_State.
1494 for Column in 0 .. Alphabet_Size loop
1495 Temp_State_Not_Null := False;
1497 for K in Meta_States (Current_State)'Range loop
1498 if Meta_States (Current_State)(K)
1499 and then First_Table (K, Column) /= 0
1500 then
1501 Closure (Nb_State + 1, First_Table (K, Column));
1502 Temp_State_Not_Null := True;
1503 end if;
1504 end loop;
1506 -- If at least one transition existed
1508 if Temp_State_Not_Null then
1510 -- Check if this new state corresponds to an old one
1512 for K in 1 .. Nb_State loop
1513 if Meta_States (K) = Meta_States (Nb_State + 1) then
1514 Table (Current_State)(Column) := K;
1516 -- Reset data, for the next time we try that state
1518 Meta_States (Nb_State + 1) := No_States;
1519 exit;
1520 end if;
1521 end loop;
1523 -- If not, create a new state
1525 if Table (Current_State)(Column) = 0 then
1526 Nb_State := Nb_State + 1;
1527 Ensure_Meta_State (Nb_State + 1);
1528 Table (Current_State)(Column) := Nb_State;
1529 end if;
1530 end if;
1531 end loop;
1533 Current_State := Current_State + 1;
1534 end loop;
1536 -- Returns the regexp
1538 declare
1539 R : Regexp_Access;
1541 begin
1542 R := new Regexp_Value (Alphabet_Size => Alphabet_Size,
1543 Num_States => Nb_State);
1544 R.Map := Map;
1545 R.Case_Sensitive := Case_Sensitive;
1547 for S in 1 .. Nb_State loop
1548 R.Is_Final (S) := Meta_States (S)(End_State);
1549 end loop;
1551 for State in 1 .. Nb_State loop
1552 for K in 0 .. Alphabet_Size loop
1553 R.States (State, K) := Table (State)(K);
1554 end loop;
1555 end loop;
1557 Unchecked_Free (Meta_States);
1558 Unchecked_Free (Table);
1560 return (Ada.Finalization.Controlled with R => R);
1561 end;
1562 end Create_Secondary_Table;
1564 ---------------------
1565 -- Raise_Exception --
1566 ---------------------
1568 procedure Raise_Exception (M : String; Index : Integer) is
1569 begin
1570 raise Error_In_Regexp with M & " at offset" & Index'Img;
1571 end Raise_Exception;
1573 -- Start of processing for Compile
1575 begin
1576 -- Special case for the empty string: it always matches, and the
1577 -- following processing would fail on it.
1579 if S = "" then
1580 return (Ada.Finalization.Controlled with
1581 R => new Regexp_Value'
1582 (Alphabet_Size => 0,
1583 Num_States => 1,
1584 Map => (others => 0),
1585 States => (others => (others => 1)),
1586 Is_Final => (others => True),
1587 Case_Sensitive => True));
1588 end if;
1590 if not Case_Sensitive then
1591 System.Case_Util.To_Lower (S);
1592 end if;
1594 -- Check the pattern is well-formed before any treatment
1596 Check_Well_Formed_Pattern;
1598 Create_Mapping;
1600 -- Creates the primary table
1602 declare
1603 Table : Regexp_Array_Access;
1604 Num_States : State_Index;
1605 Start_State : State_Index;
1606 End_State : State_Index;
1607 R : Regexp;
1609 begin
1610 Table := new Regexp_Array (1 .. Initial_Max_States_In_Primary_Table,
1611 0 .. Alphabet_Size + 10);
1612 if not Glob then
1613 Create_Primary_Table (Table, Num_States, Start_State, End_State);
1614 else
1615 Create_Primary_Table_Glob
1616 (Table, Num_States, Start_State, End_State);
1617 end if;
1619 -- Creates the secondary table
1621 R := Create_Secondary_Table (Table, Start_State, End_State);
1622 Free (Table);
1623 return R;
1624 end;
1625 end Compile;
1627 --------------
1628 -- Finalize --
1629 --------------
1631 procedure Finalize (R : in out Regexp) is
1632 procedure Free is new
1633 Ada.Unchecked_Deallocation (Regexp_Value, Regexp_Access);
1634 begin
1635 Free (R.R);
1636 end Finalize;
1638 ---------
1639 -- Get --
1640 ---------
1642 function Get
1643 (Table : Regexp_Array_Access;
1644 State : State_Index;
1645 Column : Column_Index) return State_Index
1647 begin
1648 if State <= Table'Last (1)
1649 and then Column <= Table'Last (2)
1650 then
1651 return Table (State, Column);
1652 else
1653 return 0;
1654 end if;
1655 end Get;
1657 -----------
1658 -- Match --
1659 -----------
1661 function Match (S : String; R : Regexp) return Boolean is
1662 Current_State : State_Index := 1;
1664 begin
1665 if R.R = null then
1666 raise Constraint_Error;
1667 end if;
1669 for Char in S'Range loop
1671 if R.R.Case_Sensitive then
1672 Current_State := R.R.States (Current_State, R.R.Map (S (Char)));
1673 else
1674 Current_State :=
1675 R.R.States (Current_State,
1676 R.R.Map (System.Case_Util.To_Lower (S (Char))));
1677 end if;
1679 if Current_State = 0 then
1680 return False;
1681 end if;
1683 end loop;
1685 return R.R.Is_Final (Current_State);
1686 end Match;
1688 ---------
1689 -- Set --
1690 ---------
1692 procedure Set
1693 (Table : in out Regexp_Array_Access;
1694 State : State_Index;
1695 Column : Column_Index;
1696 Value : State_Index)
1698 New_Lines : State_Index;
1699 New_Columns : Column_Index;
1700 New_Table : Regexp_Array_Access;
1702 begin
1703 if State <= Table'Last (1)
1704 and then Column <= Table'Last (2)
1705 then
1706 Table (State, Column) := Value;
1707 else
1708 -- Doubles the size of the table until it is big enough that
1709 -- (State, Column) is a valid index.
1711 New_Lines := Table'Last (1) * (State / Table'Last (1) + 1);
1712 New_Columns := Table'Last (2) * (Column / Table'Last (2) + 1);
1713 New_Table := new Regexp_Array (Table'First (1) .. New_Lines,
1714 Table'First (2) .. New_Columns);
1715 New_Table.all := (others => (others => 0));
1717 for J in Table'Range (1) loop
1718 for K in Table'Range (2) loop
1719 New_Table (J, K) := Table (J, K);
1720 end loop;
1721 end loop;
1723 Free (Table);
1724 Table := New_Table;
1725 Table (State, Column) := Value;
1726 end if;
1727 end Set;
1729 end System.Regexp;