initial
[prop.git] / include / AD / automata / regexmat.h
blob173fd858f4c0137cc42342232421ceff0b05dcc9
1 //////////////////////////////////////////////////////////////////////////////
2 // NOTICE:
3 //
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are welcomed to incorporate any part of ADLib and Prop into
9 // your programs.
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
16 // code.
18 // This software is still under development and we welcome(read crave for)
19 // any suggestions and help from the users.
21 // Allen Leung
22 // 1994
23 //////////////////////////////////////////////////////////////////////////////
25 #ifndef DFA_based_regular_expression_string_matcher_h
26 #define DFA_based_regular_expression_string_matcher_h
28 #include <AD/generic/generic.h>
29 #include <AD/automata/lexer.h>
31 //////////////////////////////////////////////////////////////////////////////
32 // Class |RegexMatch| implements a dfa based regular expression
33 // string matcher.
34 //////////////////////////////////////////////////////////////////////////////
35 class RegexMatch : public Lexer {
37 RegexMatch(const RegexMatch&); // no copy constructor
38 void operator = (const RegexMatch&); // no assignment
40 public:
42 ///////////////////////////////////////////////////////////////////////////
43 // Inherit some types
44 ///////////////////////////////////////////////////////////////////////////
45 typedef Lexer Super;
46 typedef Super::Symbol Symbol;
47 typedef Super::State State;
48 typedef Super::Offset Offset;
49 typedef Super::Rule Rule;
51 public:
53 ///////////////////////////////////////////////////////////////////////////
54 // Constructor
55 ///////////////////////////////////////////////////////////////////////////
56 RegexMatch( const Offset base_table [],
57 const State check_table [],
58 const State def_table [],
59 const State next_table [],
60 const Rule rule_table [],
61 const unsigned char equiv_table []
63 : Lexer(base_table, check_table, def_table, next_table,
64 rule_table, equiv_table) {}
66 ///////////////////////////////////////////////////////////////////////////
67 // String matching operations.
68 // Returns the matching rule number (which is compiled in the tables).
69 // If more than one rule matches that the highest priority rule is
70 // returned. Rules are numbered from 1 - n.
72 // If nothing matches then 0 is returned.
73 ///////////////////////////////////////////////////////////////////////////
74 inline Rule MatchText(const char *) const;
75 inline Rule MatchText(const char *, int) const;
76 inline Rule MatchText(const char *, const char *&) const;
77 inline Rule MatchText(const char *, int, const char *&) const;
78 inline Rule MatchText(State, const char *, const char *&) const;
79 inline Rule MatchText(State, const char *, int, const char *&, Bool = false) const;
80 Rule MatchText(State, class LexerBuffer&, const char *) const;
83 //////////////////////////////////////////////////////////////////////////////
84 // Matching operation on strings. The entire string is scanned and no
85 // backtracking is performed. The string must be null-terminated.
86 //////////////////////////////////////////////////////////////////////////////
87 inline RegexMatch::Rule RegexMatch::MatchText(register const char * string) const
88 { register State state;
89 register unsigned char c;
90 for (state = start_state; (c = *string) != '\0'; string++)
91 if ((state = go(state,c)) == 0) return 0;
92 return accept(state);
95 //////////////////////////////////////////////////////////////////////////////
96 // Matching operations on strings. This is the same as the previous
97 // version except that an extra length parameters is used. Nul's may
98 // be embedded in the string.
99 //////////////////////////////////////////////////////////////////////////////
100 inline RegexMatch::Rule RegexMatch::MatchText
101 (register const char * string, register int len) const
102 { register State state;
103 for (state = start_state; len > 0; len--) {
104 register unsigned char c = *string++;
105 if ((state = go(state,c)) == 0) return 0;
107 return accept(state);
110 //////////////////////////////////////////////////////////////////////////////
111 // Matching operation on string.
112 // This version assumes backtracking is used. We'll only try to Match
113 // the longest substring. A pointer to the rest of the string is returned.
114 // The user must also supply a start state.
116 // The special code -1 is returned if the buffer space has run out
117 // without reaching the end of the stream.
119 // Caution: the tables must be generated with the backtracking option.
120 //////////////////////////////////////////////////////////////////////////////
121 inline RegexMatch::Rule RegexMatch::MatchText
122 (register State state, register const char * s, const char *& nxt) const
123 { register unsigned char c;
124 Rule last_rule = 0;
125 const char * last_position;
127 { register Rule r = accept(state = go(state,c = *s++));
128 if (r > 0) { // state is backtrackable
129 last_rule = r; last_position = s;
130 } else if (r == -1) { // dead end state
131 // try to backtrack if possible
132 if (last_rule > 0) {
133 // we have a last rule and position, just use it dude!
134 nxt = last_position;
135 return last_rule;
136 } else {
137 // problem, return 0 for error
138 // next is set to the error position.
139 nxt = s;
140 return 0;
142 } else if (r < -1) { // last posible accept state
143 // this is the last possible position
144 nxt = s;
145 return -r-1;
147 } while (c);
149 // we have run out of buffer
150 // return -1 to signal this condition
151 return -1;
154 //////////////////////////////////////////////////////////////////////////////
155 // The version assumes that the buffer length has been explicitly
156 // supplied.
157 //////////////////////////////////////////////////////////////////////////////
158 inline RegexMatch::Rule RegexMatch::MatchText
159 ( State state,
160 const char * s,
161 int len,
162 const char *& n,
163 Bool more
164 ) const
166 Rule last_rule = 0;
167 const char * last_position;
168 for (;;)
170 Rule r;
171 if (len == 0)
172 { if (more) break;
173 else goto DONE;
174 } else
175 { state = go(state,*s++);
176 r = accept(state);
178 if (r > 0) { // state is backtrackable
179 last_rule = r; last_position = s; --len;
180 } else if (r == -1) { // dead end state
181 DONE:
182 // try to back track if possible
183 if (last_rule > 0) {
184 // we have a last rule and position, just use it dude!
185 n = last_position;
186 return last_rule;
187 } else {
188 // problem, return 0 for error
189 // n is set to the error position.
190 n = s;
191 return 0;
193 } else if (r < -1) { // last posible accept state
194 // this is the last possible position
195 n = s;
196 return -r-1;
200 // we have run out of buffer
201 // return -1 to signal this condition
202 return -1;
205 //////////////////////////////////////////////////////////////////////////////
206 // These version assumes the default start state should be used.
207 //////////////////////////////////////////////////////////////////////////////
208 inline RegexMatch::Rule RegexMatch::MatchText(const char * s, const char *& n) const
209 { return MatchText (start_state, s, n); }
210 inline RegexMatch::Rule RegexMatch::MatchText(const char * s, int len, const char *& n) const
211 { return MatchText (start_state, s, len, n); }
213 #endif