4 * PHP lexer code snarfed from the CVS tree for the lamplib project at
5 * http://sourceforge.net/projects/lamplib
6 * This project is administered by Markus Baker, Harry Fuecks and Matt
7 * Mitchell, and the project code is in the public domain.
12 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
13 * @license Public Domain {@link http://sourceforge.net/projects/lamplib}
16 /** LEXER_ENTER = 1 */
17 define("LEXER_ENTER", 1);
18 /** LEXER_MATCHED = 2 */
19 define("LEXER_MATCHED", 2);
20 /** LEXER_UNMATCHED = 3 */
21 define("LEXER_UNMATCHED", 3);
23 define("LEXER_EXIT", 4);
24 /** LEXER_SPECIAL = 5 */
25 define("LEXER_SPECIAL", 5);
28 * Compounded regular expression. Any of
29 * the contained patterns could match and
30 * when one does its label is returned.
32 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
33 * @license Public Domain {@link http://sourceforge.net/projects/lamplib}
42 * Constructor. Starts with no patterns.
43 * @param bool $case True for case sensitive, false
47 public function __construct($case) {
49 $this->_patterns
= array();
50 $this->_labels
= array();
55 * Old syntax of class constructor. Deprecated in PHP7.
57 public function ParallelRegex($case) {
58 self
::__construct($case);
62 * Adds a pattern with an optional label.
63 * @param string $pattern Perl style regex, but ( and )
64 * lose the usual meaning.
65 * @param string|bool $label Label of regex to be returned
69 function addPattern($pattern, $label = true) {
70 $count = count($this->_patterns
);
71 $this->_patterns
[$count] = $pattern;
72 $this->_labels
[$count] = $label;
77 * Attempts to match all patterns at once against
79 * @param string $subject String to match against.
80 * @param string $match First matched portion of
82 * @return bool True on success.
85 function match($subject, &$match) {
86 if (count($this->_patterns
) == 0) {
89 if (!preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
94 for ($i = 1; $i < count($matches); $i++
) {
96 return $this->_labels
[$i - 1];
103 * Compounds the patterns into a single
104 * regular expression separated with the
105 * "or" operator. Caches the regex.
106 * Will automatically escape (, ) and / tokens.
109 function _getCompoundedRegex() {
110 if ($this->_regex
== null) {
111 for ($i = 0; $i < count($this->_patterns
); $i++
) {
112 $this->_patterns
[$i] = '(' . str_replace(
113 array('/', '(', ')'),
114 array('\/', '\(', '\)'),
115 $this->_patterns
[$i]) . ')';
117 $this->_regex
= "/" . implode("|", $this->_patterns
) . "/" . $this->_getPerlMatchingFlags();
119 return $this->_regex
;
123 * Accessor for perl regex mode flags to use.
124 * @return string Flags as string.
127 function _getPerlMatchingFlags() {
128 return ($this->_case ?
"msS" : "msSi");
133 * States for a stack machine.
135 * @package moodlecore
136 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
137 * @license Public Domain {@link http://sourceforge.net/projects/lamplib}
143 * Constructor. Starts in named state.
144 * @param string $start Starting state name.
147 public function __construct($start) {
148 $this->_stack
= array($start);
152 * Old syntax of class constructor. Deprecated in PHP7.
154 public function StateStack($start) {
155 self
::__construct($start);
159 * Accessor for current state.
160 * @return string State as string.
163 function getCurrent() {
164 return $this->_stack
[count($this->_stack
) - 1];
168 * Adds a state to the stack and sets it
169 * to be the current state.
170 * @param string $state New state.
173 function enter($state) {
174 array_push($this->_stack
, $state);
178 * Leaves the current state and reverts
179 * to the previous one.
180 * @return bool False if we drop off
181 * the bottom of the list.
185 if (count($this->_stack
) == 1) {
188 array_pop($this->_stack
);
194 * Accepts text and breaks it into tokens.
195 * Some optimisation to make sure the
196 * content is only scanned by the PHP regex
197 * parser once. Lexer modes must not start
198 * with leading underscores.
200 * @package moodlecore
201 * @copyright Markus Baker, Harry Fuecks and Matt Mitchell
202 * @license Public Domain {@link http://sourceforge.net/projects/lamplib}
212 * Sets up the lexer in case insensitive matching
214 * @param object $parser Handling strategy by
216 * @param string $start Starting handler.
217 * @param bool $case True for case sensitive.
220 public function __construct(&$parser, $start = "accept", $case = false) {
221 $this->_case
= $case;
222 $this->_regexes
= array();
223 $this->_parser
= &$parser;
224 $this->_mode
= new StateStack($start);
225 $this->_mode_handlers
= array();
229 * Old syntax of class constructor for backward compatibility.
231 public function Lexer(&$parser, $start = "accept", $case = false) {
232 self
::__construct($parser, $start, $case);
236 * Adds a token search pattern for a particular
237 * parsing mode. The pattern does not change the
239 * @param string $pattern Perl style regex, but ( and )
240 * lose the usual meaning.
241 * @param string $mode Should only apply this
242 * pattern when dealing with
243 * this type of input.
246 function addPattern($pattern, $mode = "accept") {
247 if (!isset($this->_regexes
[$mode])) {
248 $this->_regexes
[$mode] = new ParallelRegex($this->_case
);
250 $this->_regexes
[$mode]->addPattern($pattern);
254 * Adds a pattern that will enter a new parsing
255 * mode. Useful for entering parenthesis, strings,
257 * @param string $pattern Perl style regex, but ( and )
258 * lose the usual meaning.
259 * @param string $mode Should only apply this
260 * pattern when dealing with
261 * this type of input.
262 * @param string $new_mode Change parsing to this new
266 function addEntryPattern($pattern, $mode, $new_mode) {
267 if (!isset($this->_regexes
[$mode])) {
268 $this->_regexes
[$mode] = new ParallelRegex($this->_case
);
270 $this->_regexes
[$mode]->addPattern($pattern, $new_mode);
274 * Adds a pattern that will exit the current mode
275 * and re-enter the previous one.
276 * @param string $pattern Perl style regex, but ( and )
277 * lose the usual meaning.
278 * @param string $mode Mode to leave.
281 function addExitPattern($pattern, $mode) {
282 if (!isset($this->_regexes
[$mode])) {
283 $this->_regexes
[$mode] = new ParallelRegex($this->_case
);
285 $this->_regexes
[$mode]->addPattern($pattern, "__exit");
289 * Adds a pattern that has a special mode.
290 * Acts as an entry and exit pattern in one go.
291 * @param string $pattern Perl style regex, but ( and )
292 * lose the usual meaning.
293 * @param string $mode Should only apply this
294 * pattern when dealing with
295 * this type of input.
296 * @param string $special Use this mode for this one token.
299 function addSpecialPattern($pattern, $mode, $special) {
300 if (!isset($this->_regexes
[$mode])) {
301 $this->_regexes
[$mode] = new ParallelRegex($this->_case
);
303 $this->_regexes
[$mode]->addPattern($pattern, "_$special");
307 * Adds a mapping from a mode to another handler.
308 * @param string $mode Mode to be remapped.
309 * @param string $handler New target handler.
312 function mapHandler($mode, $handler) {
313 $this->_mode_handlers
[$mode] = $handler;
317 * Splits the page text into tokens. Will fail
318 * if the handlers report an error or if no
319 * content is consumed. If successful then each
320 * unparsed and parsed token invokes a call to the
322 * @param string $raw Raw HTML text.
323 * @return bool True on success, else false.
326 function parse($raw) {
327 if (!isset($this->_parser
)) {
330 $length = strlen($raw);
331 while (is_array($parsed = $this->_reduce($raw))) {
332 list($unmatched, $matched, $mode) = $parsed;
333 if (!$this->_dispatchTokens($unmatched, $matched, $mode)) {
336 if (strlen($raw) == $length) {
339 $length = strlen($raw);
344 return $this->_invokeParser($raw, LEXER_UNMATCHED
);
348 * Sends the matched token and any leading unmatched
349 * text to the parser changing the lexer to a new
350 * mode if one is listed.
351 * @param string $unmatched Unmatched leading portion.
352 * @param string $matched Actual token match.
353 * @param string $mode Mode after match. The "__exit"
354 * mode causes a stack pop. A
355 * false mode causes no change.
356 * @return bool False if there was any error
360 function _dispatchTokens($unmatched, $matched, $mode = false) {
361 if (!$this->_invokeParser($unmatched, LEXER_UNMATCHED
)) {
364 if ($mode === "__exit") {
365 if (!$this->_invokeParser($matched, LEXER_EXIT
)) {
368 return $this->_mode
->leave();
370 if (strncmp($mode, "_", 1) == 0) {
371 $mode = substr($mode, 1);
372 $this->_mode
->enter($mode);
373 if (!$this->_invokeParser($matched, LEXER_SPECIAL
)) {
376 return $this->_mode
->leave();
378 if (is_string($mode)) {
379 $this->_mode
->enter($mode);
380 return $this->_invokeParser($matched, LEXER_ENTER
);
382 return $this->_invokeParser($matched, LEXER_MATCHED
);
386 * Calls the parser method named after the current
387 * mode. Empty content will be ignored.
388 * @param string $content Text parsed.
389 * @param int $is_match One of the LEXER_* constants, saying whether
390 * the content is a recognised token or unparsed data.
393 function _invokeParser($content, $is_match) {
394 if (($content === "") ||
($content === false)) {
397 $handler = $this->_mode
->getCurrent();
398 if (isset($this->_mode_handlers
[$handler])) {
399 $handler = $this->_mode_handlers
[$handler];
401 return $this->_parser
->$handler($content, $is_match);
405 * Tries to match a chunk of text and if successful
406 * removes the recognised chunk and any leading
407 * unparsed data. Empty strings will not be matched.
408 * @param string $raw The subject to parse. This is the
409 * content that will be eaten.
410 * @return bool|array Three item list of unparsed
411 * content followed by the
412 * recognised token and finally the
413 * action the parser is to take.
414 * True if no match, false if there
415 * is a parsing error.
418 function _reduce(&$raw) {
419 if (!isset($this->_regexes
[$this->_mode
->getCurrent()])) {
425 if ($action = $this->_regexes
[$this->_mode
->getCurrent()]->match($raw, $match)) {
426 $count = strpos($raw, $match);
427 $unparsed = substr($raw, 0, $count);
428 $raw = substr($raw, $count +
strlen($match));
429 return array($unparsed, $match, $action);