Merge branch 'MDL-48467-28' of git://github.com/merrill-oakland/moodle into MOODLE_28...
[moodle.git] / lib / lexer.php
blobca53509fcef0b091ea2a90e3e14a7fecd0a05bb9
1 <?php
/**
 * PHP lexer code snarfed from the CVS tree for the lamplib project at
 * http://sourceforge.net/projects/lamplib
 * This project is administered by Markus Baker, Harry Fuecks and Matt
 * Mitchell, and the project code is in the public domain.
 *
 * Thanks, guys!
 *
 * @package   moodlecore
 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
 * @license   Public Domain {@link http://sourceforge.net/projects/lamplib}
 */
/** LEXER_ENTER = 1: token kind sent to the handler when a match enters a new mode. */
define('LEXER_ENTER', 1);

/** LEXER_MATCHED = 2: token kind sent to the handler for a match that keeps the current mode. */
define('LEXER_MATCHED', 2);

/** LEXER_UNMATCHED = 3: token kind sent to the handler for text no pattern recognised. */
define('LEXER_UNMATCHED', 3);

/** LEXER_EXIT = 4: token kind sent to the handler when a match leaves the current mode. */
define('LEXER_EXIT', 4);

/** LEXER_SPECIAL = 5: token kind sent to the handler for a one-token "special" mode. */
define('LEXER_SPECIAL', 5);
/**
 * Compounded regular expression. Any of the contained patterns could match
 * and when one does, its label is returned.
 *
 * @package   moodlecore
 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
 * @license   Public Domain {@link http://sourceforge.net/projects/lamplib}
 */
class ParallelRegex {
    /** @var array Patterns exactly as they were added (never escaped in place). */
    public $_patterns;
    /** @var array Labels, index-aligned with $_patterns. */
    public $_labels;
    /** @var string|null Cached compound regex, or null when it has to be rebuilt. */
    public $_regex;
    /** @var bool True for case sensitive matching. */
    public $_case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param bool $case True for case sensitive, false for insensitive.
     */
    public function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     * Legacy PHP 4 style constructor name.
     *
     * Kept only so code that calls it explicitly keeps working; it simply
     * re-runs the real constructor.
     *
     * @param bool $case True for case sensitive, false for insensitive.
     */
    public function ParallelRegex($case) {
        self::__construct($case);
    }

    /**
     * Adds a pattern with an optional label.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual
     *                        meaning. "/", "(" and ")" are escaped
     *                        unconditionally when the regex is compounded,
     *                        so they must not be escaped by the caller.
     * @param string|bool $label Label of regex to be returned on a match.
     */
    public function addPattern($pattern, $label = true) {
        $index = count($this->_patterns);
        $this->_patterns[$index] = $pattern;
        $this->_labels[$index] = $label;
        // Invalidate the cache so the next match() recompiles the regex.
        $this->_regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject String to match against.
     * @param string $match   Receives the first matched portion of the
     *                        subject, or "" when nothing matched. Left
     *                        untouched when no patterns have been added.
     * @return string|bool The label of the pattern that matched, true when
     *                     no capturing group identified a pattern, or
     *                     false on no match.
     */
    public function match($subject, &$match) {
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (!preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }
        $match = $matches[0];
        $groups = count($matches);
        for ($i = 1; $i < $groups; $i++) {
            // Group $i wraps pattern $i - 1. Compare against '' explicitly:
            // a plain truthiness test would skip a legitimate "0" match.
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Compounds the patterns into a single regular expression separated
     * with the "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The escaped, parenthesised copies are built in a local array so that
     * the stored patterns stay raw; rebuilding after a later addPattern()
     * therefore never escapes a pattern twice.
     *
     * @return string Complete regex including delimiters and flags.
     */
    public function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                $groups[] = '(' . str_replace(
                    array('/', '(', ')'),
                    array('\/', '\(', '\)'),
                    $pattern
                ) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     *
     * @return string Flags as string.
     */
    public function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
/**
 * States for a stack machine.
 *
 * @package   moodlecore
 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
 * @license   Public Domain {@link http://sourceforge.net/projects/lamplib}
 */
class StateStack {
    /** @var array State names; the last element is the current state. */
    public $_stack;

    /**
     * Constructor. Starts in named state.
     *
     * @param string $start Starting state name.
     */
    public function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * Legacy PHP 4 style constructor name.
     *
     * Kept only so code that calls it explicitly keeps working; it simply
     * re-runs the real constructor.
     *
     * @param string $start Starting state name.
     */
    public function StateStack($start) {
        self::__construct($start);
    }

    /**
     * Accessor for current state.
     *
     * @return string State as string.
     */
    public function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Adds a state to the stack and sets it to be the current state.
     *
     * @param string $state New state.
     */
    public function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Leaves the current state and reverts to the previous one.
     *
     * @return bool False if we would drop off the bottom of the list; the
     *              starting state is never popped.
     */
    public function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}
/**
 * Accepts text and breaks it into tokens.
 * Some optimisation to make sure the content is only scanned by the PHP
 * regex parser once. Lexer modes must not start with leading underscores.
 *
 * @package   moodlecore
 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
 * @license   Public Domain {@link http://sourceforge.net/projects/lamplib}
 */
class Lexer {
    /** @var array ParallelRegex objects keyed by mode name. */
    public $_regexes;
    /** @var object Parser whose handler methods receive the tokens. */
    public $_parser;
    /** @var StateStack Stack of modes; the top entry is the current mode. */
    public $_mode;
    /** @var array Optional mode name => handler method name remapping. */
    public $_mode_handlers;
    /** @var bool True for case sensitive matching. */
    public $_case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param object $parser Handling strategy by reference.
     * @param string $start  Starting handler.
     * @param bool   $case   True for case sensitive.
     */
    public function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new StateStack($start);
        $this->_mode_handlers = array();
    }

    /**
     * Legacy PHP 4 style constructor name.
     *
     * Kept only so code that calls it explicitly keeps working; it simply
     * re-runs the real constructor.
     *
     * @param object $parser Handling strategy by reference.
     * @param string $start  Starting handler.
     * @param bool   $case   True for case sensitive.
     */
    public function Lexer(&$parser, $start = "accept", $case = false) {
        self::__construct($parser, $start, $case);
    }

    /**
     * Adds a token search pattern for a particular parsing mode. The
     * pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual
     *                        meaning.
     * @param string $mode    Should only apply this pattern when dealing
     *                        with this type of input.
     */
    public function addPattern($pattern, $mode = "accept") {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode. Useful for
     * entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern  Perl style regex, but ( and ) lose the usual
     *                         meaning.
     * @param string $mode     Should only apply this pattern when dealing
     *                         with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode) {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the
     * previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual
     *                        meaning.
     * @param string $mode    Mode to leave.
     */
    public function addExitPattern($pattern, $mode) {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        // "__exit" is the reserved label that _dispatchTokens() turns into a stack pop.
        $this->_regexes[$mode]->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode. Acts as an entry and exit
     * pattern in one go.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual
     *                        meaning.
     * @param string $mode    Should only apply this pattern when dealing
     *                        with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special) {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        // A single leading underscore marks the label as a special mode.
        $this->_regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode    Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens. Will fail if the handlers report
     * an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw HTML text.
     * @return bool True on success, else false.
     */
    public function parse($raw) {
        if (!isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            if (!$this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            // A match that consumed nothing would loop forever, so treat it as failure.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        // _reduce() returned false: the current mode has no patterns registered.
        if (!$parsed) {
            return false;
        }
        // Whatever is left over matched no pattern; hand it on as unmatched text.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     * Sends the matched token and any leading unmatched text to the parser
     * changing the lexer to a new mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched   Actual token match.
     * @param string|bool $mode Mode after match. The "__exit" mode causes
     *                          a stack pop. A leading single underscore
     *                          marks a special one-token mode. A
     *                          non-string mode causes no change.
     * @return bool False if there was any error from the parser.
     */
    public function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (!$this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if ($mode === "__exit") {
            if (!$this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if (is_string($mode)) {
            if (strncmp($mode, "_", 1) == 0) {
                // Special mode: enter it for this one token, then drop straight back.
                $mode = substr($mode, 1);
                $this->_mode->enter($mode);
                if (!$this->_invokeParser($matched, LEXER_SPECIAL)) {
                    return false;
                }
                return $this->_mode->leave();
            }
            $this->_mode->enter($mode);
            return $this->_invokeParser($matched, LEXER_ENTER);
        }
        return $this->_invokeParser($matched, LEXER_MATCHED);
    }

    /**
     * Calls the parser method named after the current mode. Empty content
     * will be ignored.
     *
     * @param string $content  Text parsed.
     * @param int    $is_match One of the LEXER_* constants describing the
     *                         kind of token being delivered.
     * @return bool True for empty content, otherwise whatever the handler
     *              returns.
     */
    public function _invokeParser($content, $is_match) {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->_mode->getCurrent();
        if (isset($this->_mode_handlers[$handler])) {
            $handler = $this->_mode_handlers[$handler];
        }
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     * Tries to match a chunk of text and if successful removes the
     * recognised chunk and any leading unparsed data. Empty strings will
     * not be matched.
     *
     * @param string $raw The subject to parse. This is the content that
     *                    will be eaten.
     * @return bool|array Three item list of unparsed content followed by
     *                    the recognised token and finally the action the
     *                    parser is to take. True if no match, false if
     *                    there is a parsing error.
     */
    public function _reduce(&$raw) {
        if (!isset($this->_regexes[$this->_mode->getCurrent()])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->_regexes[$this->_mode->getCurrent()]->match($raw, $match)) {
            // The token is located by its first literal occurrence in the
            // subject; everything before it is the unparsed leading text.
            $count = strpos($raw, $match);
            $unparsed = substr($raw, 0, $count);
            $raw = substr($raw, $count + strlen($match));
            return array($unparsed, $match, $action);
        }
        return true;
    }
}