Merge branch 'MDL-64012' of https://github.com/timhunt/moodle
[moodle.git] / lib / lexer.php
blobcca3b85967cbbf1986490f9170b5b08bf282c6b4
1 <?php
/**
 * PHP lexer code snarfed from the CVS tree for the lamplib project at
 * http://sourceforge.net/projects/lamplib
 * This project is administered by Markus Baker, Harry Fuecks and Matt
 * Mitchell, and the project code is in the public domain.
 *
 * Thanks, guys!
 *
 * @package   moodlecore
 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
 * @license   Public Domain {@link http://sourceforge.net/projects/lamplib}
 */
/** Token kind (1): the matched token switched the lexer into a new, nested mode. */
define('LEXER_ENTER', 1);

/** Token kind (2): the token was recognised and the current mode is unchanged. */
define('LEXER_MATCHED', 2);

/** Token kind (3): text that no pattern of the current mode recognised. */
define('LEXER_UNMATCHED', 3);

/** Token kind (4): the token matched an exit pattern; the lexer leaves the current mode afterwards. */
define('LEXER_EXIT', 4);

/** Token kind (5): the token matched a special pattern; its mode is entered for this one token only. */
define('LEXER_SPECIAL', 5);
/**
 * Compounded regular expression. Any of
 * the contained patterns could match and
 * when one does its label is returned.
 *
 * @package   moodlecore
 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
 * @license   Public Domain {@link http://sourceforge.net/projects/lamplib}
 */
class ParallelRegex {
    /** @var array Patterns exactly as they were passed to addPattern(), in insertion order. */
    public $_patterns;
    /** @var array Label for each pattern, stored at the same index as the pattern. */
    public $_labels;
    /** @var string|null Cached compound regex; null whenever it has to be rebuilt. */
    public $_regex;
    /** @var bool True for case sensitive matching, false for insensitive. */
    public $_case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param bool $case True for case sensitive, false for insensitive.
     */
    public function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     * Old syntax of class constructor. Deprecated in PHP7.
     *
     * @deprecated since Moodle 3.1
     * @param bool $case True for case sensitive, false for insensitive.
     */
    public function ParallelRegex($case) {
        debugging('Use of class name as constructor is deprecated', DEBUG_DEVELOPER);
        self::__construct($case);
    }

    /**
     * Adds a pattern with an optional label.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string|bool $label Label of regex to be returned on a match
     *                           (defaults to true).
     */
    public function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        // Invalidate the cache so the next match() recompiles with the new pattern.
        $this->_regex = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject String to match against.
     * @param string $match First matched portion of subject; set to "" when
     *                      nothing matches and left untouched when no pattern
     *                      has been added.
     * @return string|bool The label of the pattern that matched, true if the
     *                     match cannot be attributed to a pattern, false when
     *                     there is no match.
     */
    public function match($subject, &$match) {
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (!preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            $match = "";
            return false;
        }
        $match = $matches[0];
        $groups = count($matches);
        for ($i = 1; $i < $groups; $i++) {
            // Groups of the alternatives that did not take part are empty strings.
            // Compare against '' instead of testing truthiness so that a token
            // of "0" is still attributed to its pattern.
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Compounds the patterns into a single regular expression separated with
     * the "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are left untouched: escaping them in place would
     * escape them a second time whenever the regex is rebuilt after a later
     * addPattern() call.
     *
     * @return string The compound regex including delimiters and flags.
     */
    public function _getCompoundedRegex() {
        if ($this->_regex == null) {
            $alternatives = array();
            foreach ($this->_patterns as $pattern) {
                // One capturing group per pattern; its index identifies the label.
                $alternatives[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $alternatives) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     *
     * @return string Flags as string; "i" is appended for case insensitive matching.
     */
    public function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
/**
 * States for a stack machine.
 *
 * @package   moodlecore
 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
 * @license   Public Domain {@link http://sourceforge.net/projects/lamplib}
 */
class StateStack {
    /** @var array State names, oldest first; the last element is the current state. */
    public $_stack;

    /**
     * Creates a stack that holds only the starting state.
     *
     * @param string $start Starting state name.
     */
    public function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * Old syntax of class constructor. Deprecated in PHP7.
     *
     * @deprecated since Moodle 3.1
     * @param string $start Starting state name.
     */
    public function StateStack($start) {
        debugging('Use of class name as constructor is deprecated', DEBUG_DEVELOPER);
        self::__construct($start);
    }

    /**
     * Reports the state on top of the stack.
     *
     * @return string Current state name.
     */
    public function getCurrent() {
        $top = count($this->_stack) - 1;
        return $this->_stack[$top];
    }

    /**
     * Pushes a state, which makes it the current one.
     *
     * @param string $state New state.
     */
    public function enter($state) {
        $this->_stack[] = $state;
    }

    /**
     * Pops the current state so that the previous one becomes current again.
     * The last remaining state is never removed.
     *
     * @return bool False if only one state is left (nothing is popped), true otherwise.
     */
    public function leave() {
        if (count($this->_stack) != 1) {
            array_pop($this->_stack);
            return true;
        }
        return false;
    }
}
/**
 * Accepts text and breaks it into tokens.
 * Some optimisation to make sure the content is only scanned by the PHP
 * regex parser once. Lexer modes must not start with leading underscores,
 * because labels beginning with "_" are reserved for exit and special patterns.
 *
 * @package   moodlecore
 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
 * @license   Public Domain {@link http://sourceforge.net/projects/lamplib}
 */
class Lexer {
    /** @var array ParallelRegex objects keyed by the mode they apply to. */
    public $_regexes;
    /** @var object Parser whose methods receive the tokens. */
    public $_parser;
    /** @var StateStack Stack of lexer modes; the top is the current mode. */
    public $_mode;
    /** @var array Optional mode name => handler method name overrides. */
    public $_mode_handlers;
    /** @var bool True for case sensitive matching. */
    public $_case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param object $parser Handling strategy by reference.
     * @param string $start Starting handler.
     * @param bool $case True for case sensitive.
     */
    public function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new StateStack($start);
        $this->_mode_handlers = array();
    }

    /**
     * Old syntax of class constructor. Deprecated in PHP7.
     *
     * @deprecated since Moodle 3.1
     * @param object $parser Handling strategy by reference.
     * @param string $start Starting handler.
     * @param bool $case True for case sensitive.
     */
    public function Lexer(&$parser, $start = "accept", $case = false) {
        debugging('Use of class name as constructor is deprecated', DEBUG_DEVELOPER);
        self::__construct($parser, $start, $case);
    }

    /**
     * Returns the pattern set of a mode, creating it on first use.
     *
     * @param string $mode Mode the patterns belong to.
     * @return ParallelRegex Pattern set for that mode.
     */
    private function _regexFor($mode) {
        if (!isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        return $this->_regexes[$mode];
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with
     *                     this type of input.
     */
    public function addPattern($pattern, $mode = "accept") {
        $this->_regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with
     *                     this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode) {
        $this->_regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode) {
        $this->_regexFor($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     * Acts as an entry and exit pattern in one go.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with
     *                     this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special) {
        // The leading underscore marks the label as "special" for _dispatchTokens().
        $this->_regexFor($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens. Will fail if the handlers report an
     * error or if no content is consumed. If successful then each unparsed
     * and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw HTML text.
     * @return bool True on success, else false.
     */
    public function parse($raw) {
        if (!isset($this->_parser)) {
            return false;
        }
        $remaining = strlen($raw);
        while (true) {
            $step = $this->_reduce($raw);
            if (!is_array($step)) {
                break;
            }
            list($leading, $token, $action) = $step;
            if (!$this->_dispatchTokens($leading, $token, $action)) {
                return false;
            }
            $left = strlen($raw);
            if ($left == $remaining) {
                // Nothing was consumed; stop instead of looping forever.
                return false;
            }
            $remaining = $left;
        }
        // $step is true when the input simply ran out of matches, false on a parsing error.
        return $step ? $this->_invokeParser($raw, LEXER_UNMATCHED) : false;
    }

    /**
     * Sends the matched token and any leading unmatched text to the parser,
     * changing the lexer to a new mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param string|bool $mode Mode after match. The "__exit" mode causes a
     *                          stack pop, a mode starting with "_" is entered
     *                          for this token only, any other string is
     *                          entered, and a non-string causes no change.
     * @return bool False if there was any error from the parser.
     */
    public function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (!$this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if ($mode === "__exit") {
            // The exit token is still handled by the mode that is being left.
            return $this->_invokeParser($matched, LEXER_EXIT) && $this->_mode->leave();
        }
        if (strncmp($mode, "_", 1) == 0) {
            $this->_mode->enter(substr($mode, 1));
            return $this->_invokeParser($matched, LEXER_SPECIAL) && $this->_mode->leave();
        }
        if (is_string($mode)) {
            $this->_mode->enter($mode);
            return $this->_invokeParser($matched, LEXER_ENTER);
        }
        return $this->_invokeParser($matched, LEXER_MATCHED);
    }

    /**
     * Calls the parser method named after the current mode, or the handler
     * mapped to that mode. Empty content will be ignored.
     *
     * @param string $content Text parsed.
     * @param int $is_match One of the LEXER_* constants describing the token.
     * @return mixed True for empty content, otherwise whatever the handler returns.
     */
    public function _invokeParser($content, $is_match) {
        $nothingToSend = ($content === "") || ($content === false);
        if ($nothingToSend) {
            return true;
        }
        $mode = $this->_mode->getCurrent();
        $method = isset($this->_mode_handlers[$mode]) ? $this->_mode_handlers[$mode] : $mode;
        return $this->_parser->$method($content, $is_match);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised
     * chunk and any leading unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will
     *                    be eaten.
     * @return bool|array Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser
     *                    is to take. True if no match, false if there is a
     *                    parsing error (no patterns for the current mode).
     */
    public function _reduce(&$raw) {
        $current = $this->_mode->getCurrent();
        if (!isset($this->_regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        $action = $this->_regexes[$current]->match($raw, $match);
        if (!$action) {
            return true;
        }
        // NOTE(review): the token is located by searching for its text, so this
        // is the first occurrence of that text in $raw.
        $offset = strpos($raw, $match);
        $unparsed = substr($raw, 0, $offset);
        $raw = substr($raw, $offset + strlen($match));
        return array($unparsed, $match, $action);
    }
}