3 /* PHP lexer code snarfed from the CVS tree for the lamplib project at
4 * http://sourceforge.net/projects/lamplib
5 * This project is administered by Markus Baker, Harry Fuecks and Matt
6 * Mitchell, and the project code is in the public domain.
// Event codes the lexer hands to the parser callbacks with each chunk of text.
define('LEXER_ENTER', 1);      // matched token that pushed a new mode
define('LEXER_MATCHED', 2);    // matched token with no mode change
define('LEXER_UNMATCHED', 3);  // text that no pattern matched
define('LEXER_EXIT', 4);       // matched token that popped the current mode
define('LEXER_SPECIAL', 5);    // matched token handled in a one-token mode
18 * Compounded regular expression. Any of
19 * the contained patterns could match and
20 * when one does its label is returned.
/**
 * Constructor. Starts with no patterns.
 *
 * @param $case    True for case sensitive matching, false
 *                 for case insensitive matching.
 */
function __construct($case) {
    // _getPerlMatchingFlags() reads this to decide whether to add the "i" flag.
    $this->_case = $case;
    $this->_patterns = array();
    $this->_labels = array();
    // Nothing compiled yet; _getCompoundedRegex() builds the regex on first use.
    $this->_regex = null;
}

/**
 * PHP 4 style constructor. PHP 8 no longer treats a method named after
 * the class as a constructor, so the real work lives in __construct()
 * and this name is kept only for code that calls it explicitly.
 *
 * @param $case    True for case sensitive matching.
 */
function ParallelRegex($case) {
    $this->__construct($case);
}
/**
 * Adds a pattern with an optional label.
 *
 * @param $pattern    Perl style regex, but ( and ) lose the usual
 *                    meaning because they are escaped when the
 *                    patterns are compounded.
 * @param $label      Label of regex to be returned on a match.
 *                    Defaults to true.
 */
function addPattern($pattern, $label = true) {
    // Patterns and labels are parallel lists: the same index is used in both
    // so match() can map a capture group back to its label.
    $index = count($this->_patterns);
    $this->_patterns[$index] = $pattern;
    $this->_labels[$index] = $label;
    // Drop any cached compound regex, otherwise the new pattern would be
    // ignored once the regex has been built.
    $this->_regex = null;
}
/**
 * Attempts to match all patterns at once against a string.
 *
 * @param $subject    String to match against.
 * @param $match      Receives the first matched portion of the subject,
 *                    or an empty string when nothing matched.
 * @return            False when there are no patterns or nothing matched.
 *                    Otherwise the label of the pattern that matched, or
 *                    true when no capture group identifies the pattern.
 */
function match($subject, &$match) {
    // Never leave a stale value in the caller's variable.
    $match = '';
    if (count($this->_patterns) == 0) {
        return false;
    }
    if (!preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
        return false;
    }
    $match = $matches[0];
    // Each pattern sits in its own capture group, so group $i belongs to
    // label $i - 1. Alternatives that did not take part are reported as
    // empty strings; comparing against '' (rather than testing truthiness)
    // keeps a matched "0" from being skipped.
    $groups = count($matches);
    for ($i = 1; $i < $groups; $i++) {
        if ($matches[$i] !== '') {
            return $this->_labels[$i - 1];
        }
    }
    return true;
}
/**
 * Compounds the patterns into a single regular expression separated
 * with the "or" operator. Caches the regex.
 * Will automatically escape (, ) and / tokens.
 *
 * @return    The compound regex, including delimiters and flags.
 */
function _getCompoundedRegex() {
    if ($this->_regex === null) {
        // Build into a local list. Writing the wrapped form back into
        // $this->_patterns would wrap and escape the same pattern again
        // if the regex ever has to be rebuilt.
        $alternatives = array();
        foreach ($this->_patterns as $pattern) {
            $alternatives[] = '(' . str_replace(
                    array('/', '(', ')'),
                    array('\/', '\(', '\)'),
                    $pattern) . ')';
        }
        $this->_regex = "/" . implode("|", $alternatives) . "/" . $this->_getPerlMatchingFlags();
    }
    return $this->_regex;
}
/**
 * Accessor for the perl regex mode flags to use.
 *
 * @return    Flags as a string; the "i" flag is appended only when
 *            matching is case insensitive.
 */
function _getPerlMatchingFlags() {
    if ($this->_case) {
        return "msS";
    }
    return "msSi";
}
114 * States for a stack machine.
/**
 * Constructor. Starts in the named state.
 *
 * @param $start    Starting state name.
 */
function __construct($start) {
    // The starting state is the bottom of the stack.
    $this->_stack = array($start);
}

/**
 * PHP 4 style constructor. PHP 8 no longer treats a method named after
 * the class as a constructor, so the real work lives in __construct()
 * and this name is kept only for code that calls it explicitly.
 *
 * @param $start    Starting state name.
 */
function StateStack($start) {
    $this->__construct($start);
}
/**
 * Accessor for the current state, which is the most recently
 * entered entry on the stack.
 *
 * @return    State as string.
 */
function getCurrent() {
    $top = count($this->_stack) - 1;
    return $this->_stack[$top];
}
/**
 * Pushes a state onto the stack, making it the current state.
 *
 * @param $state    New state.
 */
function enter($state) {
    $this->_stack[] = $state;
}
/**
 * Leaves the current state and reverts to the previous one.
 *
 * @return    False if we would drop off the bottom of the list,
 *            true once the state has been popped.
 */
function leave() {
    // The last remaining entry is the starting state; it is never popped.
    if (count($this->_stack) == 1) {
        return false;
    }
    array_pop($this->_stack);
    return true;
}
164 * Accepts text and breaks it into tokens.
165 * Some optimisation to make sure the
166 * content is only scanned by the PHP regex
167 * parser once. Lexer modes must not start
168 * with leading underscores.
/**
 * Sets up the lexer, with case insensitive matching unless
 * $case is given as true.
 *
 * @param $parser    Handling strategy, taken by reference. Its methods
 *                   are called by name as tokens are found.
 * @param $start     Starting handler (the initial mode).
 * @param $case      True for case sensitive.
 */
function __construct(&$parser, $start = "accept", $case = false) {
    $this->_case = $case;
    // One ParallelRegex per mode, created on demand by the add*Pattern() methods.
    $this->_regexes = array();
    // Bound by reference (note the &) rather than assigned by value.
    $this->_parser = &$parser;
    $this->_mode = new StateStack($start);
    $this->_mode_handlers = array();
}

/**
 * PHP 4 style constructor. PHP 8 no longer treats a method named after
 * the class as a constructor, so the real work lives in __construct()
 * and this name is kept only for code that calls it explicitly.
 *
 * @param $parser    Handling strategy, taken by reference.
 * @param $start     Starting handler.
 * @param $case      True for case sensitive.
 */
function Lexer(&$parser, $start = "accept", $case = false) {
    $this->__construct($parser, $start, $case);
}
/**
 * Registers a token search pattern for one parsing mode. The pattern
 * is added without a mode label, so matching it does not change the
 * current mode.
 *
 * @param $pattern    Perl style regex, but ( and ) lose the usual meaning.
 * @param $mode       Only apply this pattern when dealing with this
 *                    type of input. Defaults to "accept".
 */
function addPattern($pattern, $mode = "accept") {
    // The regex set for a mode is created the first time the mode is used.
    if (isset($this->_regexes[$mode]) === false) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $regex = &$this->_regexes[$mode];
    $regex->addPattern($pattern);
}
/**
 * Registers a pattern that will enter a new parsing mode when it
 * matches. Useful for entering parentheses, strings and the like.
 *
 * @param $pattern     Perl style regex, but ( and ) lose the usual meaning.
 * @param $mode        Only apply this pattern when dealing with this
 *                     type of input.
 * @param $new_mode    Change parsing to this new mode; it is stored
 *                     as the pattern's label.
 */
function addEntryPattern($pattern, $mode, $new_mode) {
    // The regex set for a mode is created the first time the mode is used.
    if (isset($this->_regexes[$mode]) === false) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $regex = &$this->_regexes[$mode];
    $regex->addPattern($pattern, $new_mode);
}
/**
 * Registers a pattern that will exit the current mode and re-enter
 * the previous one. The pattern is labelled "__exit", which the
 * dispatcher treats as a request to pop the mode stack.
 *
 * @param $pattern    Perl style regex, but ( and ) lose the usual meaning.
 * @param $mode       Mode to leave.
 */
function addExitPattern($pattern, $mode) {
    // The regex set for a mode is created the first time the mode is used.
    if (isset($this->_regexes[$mode]) === false) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $regex = &$this->_regexes[$mode];
    $regex->addPattern($pattern, "__exit");
}
/**
 * Registers a pattern that has a special mode. Acts as an entry and
 * exit pattern in one go: the label is the special mode name with a
 * leading underscore, which the dispatcher strips before entering
 * that mode for the single matched token.
 *
 * @param $pattern    Perl style regex, but ( and ) lose the usual meaning.
 * @param $mode       Only apply this pattern when dealing with this
 *                    type of input.
 * @param $special    Use this mode for this one token.
 */
function addSpecialPattern($pattern, $mode, $special) {
    // The regex set for a mode is created the first time the mode is used.
    if (isset($this->_regexes[$mode]) === false) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $regex = &$this->_regexes[$mode];
    $regex->addPattern($pattern, "_$special");
}
/**
 * Maps a mode onto another handler. When the mode is current, the
 * mapped handler name is the parser method that gets called instead
 * of one named after the mode itself.
 *
 * @param $mode       Mode to be remapped.
 * @param $handler    New target handler.
 */
function mapHandler($mode, $handler) {
    $this->_mode_handlers[$mode] = $handler;
}
/**
 * Splits the page text into tokens. Will fail if the handlers report
 * an error or if no content is consumed. If successful then each
 * unparsed and parsed token invokes a call to the parser.
 *
 * @param $raw    Raw HTML text.
 * @return        True on success, else false.
 */
function parse($raw) {
    if (!isset($this->_parser)) {
        return false;
    }
    $length = strlen($raw);
    // _reduce() eats the front of $raw and returns a three item list for
    // every token it finds; anything that is not an array ends the loop.
    while (is_array($parsed = $this->_reduce($raw))) {
        list($unmatched, $matched, $mode) = $parsed;
        if (!$this->_dispatchTokens($unmatched, $matched, $mode)) {
            return false;
        }
        // A match that consumed nothing would loop forever, so treat it as failure.
        if (strlen($raw) == $length) {
            return false;
        }
        $length = strlen($raw);
    }
    // _reduce() is documented to return false for a parsing error and
    // true when there is simply no further match.
    if (!$parsed) {
        return false;
    }
    // Whatever is left over is handed on as unmatched text.
    return $this->_invokeParser($raw, LEXER_UNMATCHED);
}
/**
 * Sends the matched token and any leading unmatched text to the
 * parser, changing the lexer to a new mode if one is listed.
 *
 * @param $unmatched    Unmatched leading portion.
 * @param $matched      Actual token match.
 * @param $mode         Mode after match. The "__exit" mode causes a
 *                      stack pop. A mode starting with a single
 *                      underscore is entered for this one token only.
 *                      Any other string is entered as a new mode. A
 *                      non-string mode (false or true) causes no change.
 * @return              False if there was any error from the parser
 *                      or the mode stack.
 */
function _dispatchTokens($unmatched, $matched, $mode = false) {
    if (!$this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
        return false;
    }
    // Exit pattern: report the token in the current mode, then pop it.
    if ($mode === "__exit") {
        if (!$this->_invokeParser($matched, LEXER_EXIT)) {
            return false;
        }
        return $this->_mode->leave();
    }
    // Special pattern: strip the marker underscore, enter the mode for
    // this one token and leave it again straight away.
    if (is_string($mode) && strncmp($mode, "_", 1) == 0) {
        $mode = substr($mode, 1);
        $this->_mode->enter($mode);
        if (!$this->_invokeParser($matched, LEXER_SPECIAL)) {
            return false;
        }
        return $this->_mode->leave();
    }
    // Entry pattern: switch mode first so the new mode's handler sees the token.
    if (is_string($mode)) {
        $this->_mode->enter($mode);
        return $this->_invokeParser($matched, LEXER_ENTER);
    }
    return $this->_invokeParser($matched, LEXER_MATCHED);
}
/**
 * Calls the parser method named after the current mode, or the
 * handler that mode has been mapped to. Empty content is ignored.
 *
 * @param $content     Text parsed.
 * @param $is_match    One of the LEXER_* codes saying how the content
 *                     was produced; passed through to the handler.
 * @return             True for ignored empty content, otherwise
 *                     whatever the handler returns.
 */
function _invokeParser($content, $is_match) {
    // Nothing to report. This counts as success so that a token with no
    // leading unmatched text does not abort the parse.
    if (($content === "") || ($content === false)) {
        return true;
    }
    $handler = $this->_mode->getCurrent();
    if (isset($this->_mode_handlers[$handler])) {
        $handler = $this->_mode_handlers[$handler];
    }
    // Variable method call: $handler holds the name of the parser method.
    return $this->_parser->$handler($content, $is_match);
}
364 * Tries to match a chunk of text and if successful
365 * removes the recognised chunk and any leading
366 * unparsed data. Empty strings will not be matched.
367 * @param $raw The subject to parse. This is the
368 * content that will be eaten.
369 * @return Three item list of unparsed
370 * content followed by the
371 * recognised token and finally the
372 * action the parser is to take.
373 * True if no match, false if there
374 * is a parsing error.
377 function _reduce(&$raw) {
378 if (!isset($this->_regexes
[$this->_mode
->getCurrent()])) {
384 if ($action = $this->_regexes
[$this->_mode
->getCurrent()]->match($raw, $match)) {
385 $count = strpos($raw, $match);
386 $unparsed = substr($raw, 0, $count);
387 $raw = substr($raw, $count +
strlen($match));
388 return array($unparsed, $match, $action);