Adding extra charsets for ActionMailer unit tests, if you're looking to parse incomin...
[akelos.git] / lib / AkLexer.php
blobc11ef9d9bc7ce279710a4d432272bc2f01695b10
1 <?php
2 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
4 // +----------------------------------------------------------------------+
5 // | Akelos Framework - http://www.akelos.org |
6 // +----------------------------------------------------------------------+
7 // | Copyright (c) 2002-2006, Akelos Media, S.L. & Bermi Ferrer Martinez |
8 // | Released under the GNU Lesser General Public License, see LICENSE.txt|
9 // +----------------------------------------------------------------------+
/**
 * Author Marcus Baker: http://www.lastcraft.com
 * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * http://www.phppatterns.com/index.php/article/articleview/106/1/2/
 * @author Marcus Baker
 * @package ActiveSupport
 * @subpackage GenericParser
 */
/**#@+
 * Lexer mode constant. One of these values is passed to the parser
 * handler along with each piece of content.
 */
define('AK_LEXER_ENTER', 1);
define('AK_LEXER_MATCHED', 2);
define('AK_LEXER_UNMATCHED', 3);
define('AK_LEXER_EXIT', 4);
define('AK_LEXER_SPECIAL', 5);
/**#@-*/
/**
 * Compounded regular expression. Any of the contained patterns
 * could match and when one does its label is returned.
 */
class AkLexerParallelRegex {
    /** Raw patterns exactly as handed to addPattern(), in insertion order. */
    var $_patterns;
    /** Labels, index-aligned with $_patterns. */
    var $_labels;
    /** Cached compound regex, or null when it has to be rebuilt. */
    var $_regex;
    /** True for case sensitive matching. */
    var $_case;

    /**
     * Constructor. Starts with no patterns.
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     * @access public
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     * PHP 4 style constructor. Kept as a plain method so legacy code that
     * calls it by name keeps working; it only delegates to __construct().
     * @param boolean $case    True for case sensitive.
     * @access public
     */
    function AkLexerParallelRegex($case) {
        $this->__construct($case);
    }

    /**
     * Adds a pattern with an optional label.
     * @param mixed $pattern   Perl style regex. Must be UTF-8
     *                         encoded. If it is a string, the (, )
     *                         lose their meaning unless they
     *                         form part of a lookahead or
     *                         lookbehind assertion.
     * @param string $label    Label of regex to be returned
     *                         on a match. Label must be ASCII.
     * @access public
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        // Force the compound regex to be rebuilt on next use.
        $this->_regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     * @param string $subject  String to match against.
     * @param string $match    First matched portion of subject
     *                         (set to '' when nothing matches).
     * @return mixed           Label of the matching pattern, true if
     *                         no label could be determined, false
     *                         when nothing matched.
     * @access public
     */
    function match($subject, &$match) {
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            $match = '';
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        for ($i = 1; $i < $size; $i++) {
            // Compare against '' rather than testing truthiness: a token
            // consisting of "0" is falsy but is still a real match.
            if ($matches[$i] !== '' && isset($this->_labels[$i - 1])) {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * @param string $subject  String to match against.
     * @param array $split     The split result: array containing
     *                         pre-match, match & post-match strings.
     *                         Set to array($subject, "", "") when
     *                         nothing matches.
     * @return mixed           Label of the matching pattern (true when
     *                         it has none), false when nothing matched.
     * @access public
     *
     * @author Christopher Smith <chris@jalakai.co.uk>
     */
    function split($subject, &$split) {
        if (count($this->_patterns) == 0) {
            return false;
        }

        // Capture the offset so the subject is scanned only once.
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            $split = array($subject, "", "");
            return false;
        }

        // Trailing unmatched groups are not reported, so the last
        // reported group belongs to the pattern that matched.
        $idx = count($matches) - 2;

        list($matched, $offset) = $matches[0];
        $pre = (string) substr($subject, 0, $offset);
        $post = (string) substr($subject, $offset + strlen($matched));

        $split = array($pre, $matched, $post);
        return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
    }

    /**
     * Compounds the patterns into a single regular expression
     * separated with the "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     * The stored patterns are left untouched so that the regex can be
     * rebuilt safely after further addPattern() calls.
     * @return string          The compound regex.
     * @access private
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $compiled = array();
            foreach ($this->_patterns as $pattern) {
                $compiled[] = '(' . $this->_quotePattern($pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $compiled) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Escapes /, ( and ) in a single pattern while preserving inline
     * option settings, lookaheads, lookbehinds and non-capturing groups.
     * @param string $pattern  Raw pattern as given to addPattern().
     * @return string          Pattern ready to be wrapped in a group.
     * @access private
     */
    function _quotePattern($pattern) {
        // Two \x01 bytes bracket the constructs that must survive quoting.
        $m = "\1\1";

        // Replace lookaheads / lookbehinds with marker
        $pattern = preg_replace(
            array (
                '/\(\?(i|m|s|x|U)\)/U',
                '/\(\?(\-[i|m|s|x|U])\)/U',
                '/\(\?\=(.*)\)/sU',
                '/\(\?\!(.*)\)/sU',
                '/\(\?\<\=(.*)\)/sU',
                '/\(\?\<\!(.*)\)/sU',
                '/\(\?\:(.*)\)/sU',
            ),
            array (
                $m.'SO:\\1'.$m,
                $m.'SOR:\\1'.$m,
                $m.'LA:IS:\\1'.$m,
                $m.'LA:NOT:\\1'.$m,
                $m.'LB:IS:\\1'.$m,
                $m.'LB:NOT:\\1'.$m,
                $m.'GRP:\\1'.$m,
            ),
            $pattern
        );

        // Quote the rest
        $pattern = str_replace(
            array('/', '(', ')'),
            array('\/', '\(', '\)'),
            $pattern
        );

        // Restore lookaheads / lookbehinds
        $pattern = preg_replace(
            array (
                '/'.$m.'SO:(.{1})'.$m.'/',
                '/'.$m.'SOR:(.{2})'.$m.'/',
                '/'.$m.'LA:IS:(.*)'.$m.'/sU',
                '/'.$m.'LA:NOT:(.*)'.$m.'/sU',
                '/'.$m.'LB:IS:(.*)'.$m.'/sU',
                '/'.$m.'LB:NOT:(.*)'.$m.'/sU',
                '/'.$m.'GRP:(.*)'.$m.'/sU',
            ),
            array (
                '(?\\1)',
                '(?\\1)',
                '(?=\\1)',
                '(?!\\1)',
                '(?<=\\1)',
                '(?<!\\1)',
                '(?:\\1)',
            ),
            $pattern
        );

        return $pattern;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     * @access private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
/**
 * States for a stack machine.
 */
class AkLexerStateStack {
    /** List of state names; the last element is the current state. */
    var $_stack;

    /**
     * Constructor. Starts in named state.
     * @param string $start        Starting state name.
     * @access public
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * PHP 4 style constructor. Kept as a plain method so legacy code that
     * calls it by name keeps working; it only delegates to __construct().
     * @param string $start        Starting state name.
     * @access public
     */
    function AkLexerStateStack($start) {
        $this->__construct($start);
    }

    /**
     * Accessor for current state.
     * @return string       State.
     * @access public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Adds a state to the stack and sets it
     * to be the current state.
     * @param string $state        New state.
     * @access public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Leaves the current state and reverts
     * to the previous one.
     * @return boolean    False if we drop off
     *                    the bottom of the list.
     * @access public
     */
    function leave() {
        // The starting state is never popped.
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}
/**
 * Accepts text and breaks it into tokens.
 * Some optimisation to make sure the
 * content is only scanned by the PHP regex
 * parser once. Lexer modes must not start
 * with leading underscores.
 */
class AkLexer {
    /** AkLexerParallelRegex instances keyed by mode name. */
    var $_regexes;
    /** Object whose handler methods receive the tokens. */
    var $_parser;
    /** AkLexerStateStack holding the current mode. */
    var $_mode;
    /** Optional mode name => handler method name remapping. */
    var $_mode_handlers;
    /** True for case sensitive matching. */
    var $_case;

    /**
     * Sets up the lexer in case insensitive matching
     * by default.
     * @param AkParser $parser     Handling strategy by
     *                             reference.
     * @param string $start        Starting handler.
     * @param boolean $case        True for case sensitive.
     * @access public
     */
    function __construct(&$parser, $start = 'accept', $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        // Plain assignment: "= &new" no longer parses on PHP 7 and later.
        $this->_mode = new AkLexerStateStack($start);
        $this->_mode_handlers = array();
    }

    /**
     * PHP 4 style constructor. Kept as a plain method so legacy code that
     * calls it by name keeps working; it only delegates to __construct().
     * @param AkParser $parser     Handling strategy by reference.
     * @param string $start        Starting handler.
     * @param boolean $case        True for case sensitive.
     * @access public
     */
    function AkLexer(&$parser, $start = 'accept', $case = false) {
        $this->__construct($parser, $start, $case);
    }

    /**
     * Adds a token search pattern for a particular
     * parsing mode. The pattern does not change the
     * current mode.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @access public
     */
    function addPattern($pattern, $mode = "accept") {
        $this->_ensureRegexForMode($mode);
        $this->_regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing
     * mode. Useful for entering parenthesis, strings,
     * tags, etc.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @param string $new_mode     Change parsing to this new
     *                             nested mode.
     * @access public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        $this->_ensureRegexForMode($mode);
        $this->_regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode
     * and re-enter the previous one.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Mode to leave.
     * @access public
     */
    function addExitPattern($pattern, $mode) {
        $this->_ensureRegexForMode($mode);
        $this->_regexes[$mode]->addPattern($pattern, '__exit');
    }

    /**
     * Adds a pattern that has a special mode. Acts as an entry
     * and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @param string $special      Use this mode for this one token.
     * @access public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        $this->_ensureRegexForMode($mode);
        // The leading underscore marks the label as a one-token mode.
        $this->_regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     * @access public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens. Will fail
     * if the handlers report an error or if no
     * content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the
     * held listener.
     * @param string $raw        Raw HTML text.
     * @return boolean           True on success, else false.
     * @access public
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }

        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        while (is_array($parsed = $this->_reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            // $raw now holds only the text after the match.
            $currentLength = strlen($raw);
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // Nothing was consumed: bail out instead of looping forever.
            if ($currentLength == $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        if (! $parsed) {
            return false;
        }
        // Whatever is left over is unmatched trailing content.
        return $this->_invokeParser($raw, AK_LEXER_UNMATCHED, $pos);
    }

    /**
     * Makes sure a parallel regex exists for the given mode.
     * @param string $mode         Mode name.
     * @access private
     */
    function _ensureRegexForMode($mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new AkLexerParallelRegex($this->_case);
        }
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     * @param string $unmatched    Unmatched leading portion.
     * @param string $matched      Actual token match.
     * @param mixed $mode          Mode after match. A non-string
     *                             mode causes no change.
     * @param int $initialPos      Byte index in the raw document of
     *                             the unmatched leading portion.
     * @param int $matchPos        Byte index in the raw document of
     *                             the matched token.
     * @return boolean             False if there was any error
     *                             from the parser.
     * @access private
     */
    function _dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos) {
        if (! $this->_invokeParser($unmatched, AK_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, AK_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            // Enter the special mode for this one token only.
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, AK_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if (is_string($mode)) {
            $this->_mode->enter($mode);
            return $this->_invokeParser($matched, AK_LEXER_ENTER, $matchPos);
        }
        return $this->_invokeParser($matched, AK_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave
     * the current mode and pop an item from the matching
     * mode stack.
     * @param string $mode    Mode to test.
     * @return boolean        True if this is the exit mode.
     * @access private
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode
     * is entered for this token only and automatically
     * leaves immediately afterwards.
     * @param mixed $mode     Mode to test.
     * @return boolean        True if this is a single token mode.
     * @access private
     */
    function _isSpecialMode($mode) {
        // Unlabelled patterns carry boolean true, which is never special.
        return is_string($mode) && (strncmp($mode, "_", 1) == 0);
    }

    /**
     * Strips the magic underscore marking single token
     * modes.
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     * @access private
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current
     * mode (or the handler mapped to it). Empty content
     * will be ignored.
     * @param string $content        Text parsed.
     * @param int $is_match          One of the AK_LEXER_* constants
     *                               describing the kind of content.
     * @param int $pos               Byte index location in the raw
     *                               document of this content.
     * @return mixed                 True for empty content, otherwise
     *                               whatever the handler returns.
     * @access private
     */
    function _invokeParser($content, $is_match, $pos) {
        if (($content === '') || ($content === false)) {
            return true;
        }
        $handler = $this->_mode->getCurrent();
        if (isset($this->_mode_handlers[$handler])) {
            $handler = $this->_mode_handlers[$handler];
        }
        return $this->_parser->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful
     * removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     * @param string $raw         The subject to parse. This is the
     *                            content that will be eaten.
     * @return mixed              Three item list of unparsed
     *                            content followed by the
     *                            recognised token and finally the
     *                            action the parser is to take.
     *                            True if no match, false if there
     *                            is a parsing error.
     * @access private
     */
    function _reduce(&$raw) {
        $current = $this->_mode->getCurrent();
        // A mode without any registered pattern is a parsing error.
        if (! isset($this->_regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->_regexes[$current]->split($raw, $split)) {
            list($unparsed, $match, $raw) = $split;
            return array($unparsed, $match, $action);
        }
        return true;
    }
}