4 * PHP lexer code snarfed from the CVS tree for the lamplib project at
5 * http://sourceforge.net/projects/lamplib
6 * This project is administered by Markus Baker, Harry Fuecks and Matt
7 * Mitchell, and the project code is in the public domain.
12 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
13 * @license Public Domain {@link http://sourceforge.net/projects/lamplib}
/** LEXER_ENTER = 1: reported with a token whose pattern switches the lexer into a new mode. */
define('LEXER_ENTER', 1);
/** LEXER_MATCHED = 2: reported with a token that matched without changing the mode. */
define('LEXER_MATCHED', 2);
/** LEXER_UNMATCHED = 3: reported with text that no pattern matched. */
define('LEXER_UNMATCHED', 3);
/** LEXER_EXIT = 4: reported with a token whose pattern leaves the current mode. */
define('LEXER_EXIT', 4);
/** LEXER_SPECIAL = 5: reported with a token handled in a one-off special mode. */
define('LEXER_SPECIAL', 5);
28 * Compounded regular expression. Any of
29 * the contained patterns could match and
30 * when one does its label is returned.
32 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
33 * @license Public Domain {@link http://sourceforge.net/projects/lamplib}
42 * Constructor. Starts with no patterns.
43 * @param bool $case True for case sensitive, false
47 public function __construct($case) {
49 $this->_patterns
= array();
50 $this->_labels
= array();
/**
 * Old-style constructor named after the class (deprecated in PHP 7).
 *
 * Raises a developer-level debugging message and then hands over to
 * __construct() with the same argument.
 *
 * @deprecated since Moodle 3.1
 * @param bool $case Forwarded unchanged to __construct().
 */
public function ParallelRegex($case) {
    $notice = 'Use of class name as constructor is deprecated';
    debugging($notice, DEBUG_DEVELOPER);
    self::__construct($case);
}
65 * Adds a pattern with an optional label.
66 * @param string $pattern Perl style regex, but ( and )
67 * lose the usual meaning.
68 * @param string $label Label of regex to be returned
72 function addPattern($pattern, $label = true) {
73 $count = count($this->_patterns
);
74 $this->_patterns
[$count] = $pattern;
75 $this->_labels
[$count] = $label;
80 * Attempts to match all patterns at once against
82 * @param string $subject String to match against.
83 * @param string $match First matched portion of
85 * @return bool True on success.
88 function match($subject, &$match) {
89 if (count($this->_patterns
) == 0) {
92 if (!preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
97 for ($i = 1; $i < count($matches); $i++
) {
99 return $this->_labels
[$i - 1];
/**
 * Compounds the patterns into a single regular expression in which every
 * pattern is its own parenthesised group and the groups are joined with
 * the "or" operator. The result is cached in $this->_regex.
 *
 * The characters (, ) and / inside each pattern are escaped automatically.
 * The escaped groups are collected in a local array, so the stored
 * patterns stay exactly as they were added; rebuilding the regex after
 * the cache has been cleared therefore never escapes or wraps a pattern
 * a second time.
 *
 * @return string Perl style regex including delimiters and flags.
 */
function _getCompoundedRegex() {
    if ($this->_regex == null) {
        $groups = array();
        foreach ($this->_patterns as $pattern) {
            // Escape the delimiter and grouping characters, then wrap the
            // pattern so that it occupies exactly one capture group.
            $escaped = str_replace(
                    array('/', '(', ')'),
                    array('\/', '\(', '\)'),
                    $pattern);
            $groups[] = '(' . $escaped . ')';
        }
        $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
    }
    return $this->_regex;
}
/**
 * Builds the modifier string appended to the compounded regex.
 *
 * The flags "msS" are always present; "i" is added when matching is
 * case insensitive.
 *
 * @return string Flags as string.
 */
function _getPerlMatchingFlags() {
    $flags = "msS";
    if (!$this->_case) {
        $flags .= "i";
    }
    return $flags;
}
136 * States for a stack machine.
138 * @package moodlecore
139 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
140 * @license Public Domain {@link http://sourceforge.net/projects/lamplib}
/**
 * Constructor. The stack starts out holding only the named state.
 *
 * @param string $start Starting state name.
 */
public function __construct($start) {
    $initial = array();
    $initial[] = $start;
    $this->_stack = $initial;
}
/**
 * Old-style constructor named after the class (deprecated in PHP 7).
 *
 * Raises a developer-level debugging message and then hands over to
 * __construct() with the same argument.
 *
 * @deprecated since Moodle 3.1
 * @param string $start Forwarded unchanged to __construct().
 */
public function StateStack($start) {
    $notice = 'Use of class name as constructor is deprecated';
    debugging($notice, DEBUG_DEVELOPER);
    self::__construct($start);
}
/**
 * Returns the state on top of the stack without removing it.
 *
 * @return string State as string.
 */
function getCurrent() {
    $top = count($this->_stack) - 1;
    return $this->_stack[$top];
}
/**
 * Pushes a state onto the stack, making it the current state.
 *
 * @param string $state New state.
 */
function enter($state) {
    $this->_stack[] = $state;
}
184 * Leaves the current state and reverts
185 * to the previous one.
186 * @return bool False if we drop off
187 * the bottom of the list.
191 if (count($this->_stack
) == 1) {
194 array_pop($this->_stack
);
200 * Accepts text and breaks it into tokens.
201 * Some optimisation to make sure the
202 * content is only scanned by the PHP regex
203 * parser once. Lexer modes must not start
204 * with leading underscores.
206 * @package moodlecore
207 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
208 * @license Public Domain {@link http://sourceforge.net/projects/lamplib}
/**
 * Sets up the lexer. Matching is case insensitive unless $case is true.
 * No patterns and no handler mappings are registered at this point.
 *
 * @param object $parser Object whose methods are called as handlers;
 *                       it is stored by reference.
 * @param string $start  Name of the initial mode placed on the mode stack.
 * @param bool   $case   True for case sensitive.
 */
public function __construct(&$parser, $start = "accept", $case = false) {
    // Collaborators first, then the empty registries.
    $this->_parser = &$parser;
    $this->_mode = new StateStack($start);
    $this->_case = $case;
    $this->_regexes = array();
    $this->_mode_handlers = array();
}
/**
 * Old-style constructor named after the class (deprecated in PHP 7).
 *
 * Raises a developer-level debugging message and then hands over to
 * __construct() with the same arguments.
 *
 * @deprecated since Moodle 3.1
 * @param object $parser Forwarded to __construct().
 * @param string $start  Forwarded to __construct().
 * @param bool   $case   Forwarded to __construct().
 */
public function Lexer(&$parser, $start = "accept", $case = false) {
    $notice = 'Use of class name as constructor is deprecated';
    debugging($notice, DEBUG_DEVELOPER);
    self::__construct($parser, $start, $case);
}
/**
 * Adds a token search pattern for a particular parsing mode. The pattern
 * is registered without a mode label of its own.
 *
 * The ParallelRegex for the mode is created on first use.
 *
 * @param string $pattern Perl style regex, but ( and )
 *                        lose the usual meaning.
 * @param string $mode    Should only apply this
 *                        pattern when dealing with
 *                        this type of input.
 */
function addPattern($pattern, $mode = "accept") {
    $known = isset($this->_regexes[$mode]);
    if ($known === false) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $regex = $this->_regexes[$mode];
    $regex->addPattern($pattern);
}
/**
 * Adds a pattern that will enter a new parsing mode. The new mode's name
 * is stored as the label of the pattern.
 *
 * The ParallelRegex for $mode is created on first use.
 *
 * @param string $pattern  Perl style regex, but ( and )
 *                         lose the usual meaning.
 * @param string $mode     Should only apply this
 *                         pattern when dealing with
 *                         this type of input.
 * @param string $new_mode Change parsing to this new mode.
 */
function addEntryPattern($pattern, $mode, $new_mode) {
    $known = isset($this->_regexes[$mode]);
    if ($known === false) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $regex = $this->_regexes[$mode];
    $regex->addPattern($pattern, $new_mode);
}
/**
 * Adds a pattern that will exit the current mode and re-enter the
 * previous one. The pattern is stored under the reserved label "__exit".
 *
 * The ParallelRegex for $mode is created on first use.
 *
 * @param string $pattern Perl style regex, but ( and )
 *                        lose the usual meaning.
 * @param string $mode    Mode to leave.
 */
function addExitPattern($pattern, $mode) {
    $known = isset($this->_regexes[$mode]);
    if ($known === false) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $regex = $this->_regexes[$mode];
    $regex->addPattern($pattern, "__exit");
}
/**
 * Adds a pattern that has a special mode. Acts as an entry and exit
 * pattern in one go. The pattern is stored under the special mode's
 * name prefixed with a single underscore.
 *
 * The ParallelRegex for $mode is created on first use.
 *
 * @param string $pattern Perl style regex, but ( and )
 *                        lose the usual meaning.
 * @param string $mode    Should only apply this
 *                        pattern when dealing with
 *                        this type of input.
 * @param string $special Use this mode for this one token.
 */
function addSpecialPattern($pattern, $mode, $special) {
    $known = isset($this->_regexes[$mode]);
    if ($known === false) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $regex = $this->_regexes[$mode];
    $regex->addPattern($pattern, "_$special");
}
/**
 * Registers a handler name to be used in place of a mode's own name.
 * A later call for the same mode replaces the earlier mapping.
 *
 * @param string $mode    Mode to be remapped.
 * @param string $handler New target handler.
 */
function mapHandler($mode, $handler) {
    $this->_mode_handlers[$mode] = $handler;
}
326 * Splits the page text into tokens. Will fail
327 * if the handlers report an error or if no
328 * content is consumed. If successful then each
329 * unparsed and parsed token invokes a call to the
331 * @param string $raw Raw HTML text.
332 * @return bool True on success, else false.
335 function parse($raw) {
336 if (!isset($this->_parser
)) {
339 $length = strlen($raw);
340 while (is_array($parsed = $this->_reduce($raw))) {
341 list($unmatched, $matched, $mode) = $parsed;
342 if (!$this->_dispatchTokens($unmatched, $matched, $mode)) {
345 if (strlen($raw) == $length) {
348 $length = strlen($raw);
353 return $this->_invokeParser($raw, LEXER_UNMATCHED
);
357 * Sends the matched token and any leading unmatched
358 * text to the parser changing the lexer to a new
359 * mode if one is listed.
360 * @param string $unmatched Unmatched leading portion.
361 * @param string $matched Actual token match.
362 * @param string $mode Mode after match. The "__exit"
363 * mode causes a stack pop. A
364 * false mode causes no change.
365 * @return bool False if there was any error
369 function _dispatchTokens($unmatched, $matched, $mode = false) {
370 if (!$this->_invokeParser($unmatched, LEXER_UNMATCHED
)) {
373 if ($mode === "__exit") {
374 if (!$this->_invokeParser($matched, LEXER_EXIT
)) {
377 return $this->_mode
->leave();
379 if (strncmp($mode, "_", 1) == 0) {
380 $mode = substr($mode, 1);
381 $this->_mode
->enter($mode);
382 if (!$this->_invokeParser($matched, LEXER_SPECIAL
)) {
385 return $this->_mode
->leave();
387 if (is_string($mode)) {
388 $this->_mode
->enter($mode);
389 return $this->_invokeParser($matched, LEXER_ENTER
);
391 return $this->_invokeParser($matched, LEXER_MATCHED
);
395 * Calls the parser method named after the current
396 * mode. Empty content will be ignored.
397 * @param string $content Text parsed.
398 * @param int $is_match One of the LEXER_* constants saying
399 * how the content was recognised.
402 function _invokeParser($content, $is_match) {
403 if (($content === "") ||
($content === false)) {
406 $handler = $this->_mode
->getCurrent();
407 if (isset($this->_mode_handlers
[$handler])) {
408 $handler = $this->_mode_handlers
[$handler];
410 return $this->_parser
->$handler($content, $is_match);
414 * Tries to match a chunk of text and if successful
415 * removes the recognised chunk and any leading
416 * unparsed data. Empty strings will not be matched.
417 * @param string $raw The subject to parse. This is the
418 * content that will be eaten.
419 * @return bool|array Three item list of unparsed
420 * content followed by the
421 * recognised token and finally the
422 * action the parser is to take.
423 * True if no match, false if there
424 * is a parsing error.
427 function _reduce(&$raw) {
428 if (!isset($this->_regexes
[$this->_mode
->getCurrent()])) {
434 if ($action = $this->_regexes
[$this->_mode
->getCurrent()]->match($raw, $match)) {
435 $count = strpos($raw, $match);
436 $unparsed = substr($raw, 0, $count);
437 $raw = substr($raw, $count +
strlen($match));
438 return array($unparsed, $match, $action);