premier commit
[bazdig.git] / test / simpletest / parser.php
blob9a4f69491c9c8e814b51f27a62e63929cc9fffe5
1 <?php
2 /**
3 * base include file for SimpleTest
4 * @package SimpleTest
5 * @subpackage MockObjects
6 * @version $Id: parser.php,v 1.73 2006/11/21 00:26:55 lastcraft Exp $
7 */
9 /**#@+
10 * Lexer mode stack constants
12 if (! defined('LEXER_ENTER')) {
13 define('LEXER_ENTER', 1);
15 if (! defined('LEXER_MATCHED')) {
16 define('LEXER_MATCHED', 2);
18 if (! defined('LEXER_UNMATCHED')) {
19 define('LEXER_UNMATCHED', 3);
21 if (! defined('LEXER_EXIT')) {
22 define('LEXER_EXIT', 4);
24 if (! defined('LEXER_SPECIAL')) {
25 define('LEXER_SPECIAL', 5);
27 /**#@-*/
29 /**
30 * Compounded regular expression. Any of
31 * the contained patterns could match and
32 * when one does, it's label is returned.
33 * @package SimpleTest
34 * @subpackage WebTester
36 class ParallelRegex {
37 var $_patterns;
38 var $_labels;
39 var $_regex;
40 var $_case;
42 /**
43 * Constructor. Starts with no patterns.
44 * @param boolean $case True for case sensitive, false
45 * for insensitive.
46 * @access public
48 function ParallelRegex($case) {
49 $this->_case = $case;
50 $this->_patterns = array();
51 $this->_labels = array();
52 $this->_regex = null;
55 /**
56 * Adds a pattern with an optional label.
57 * @param string $pattern Perl style regex, but ( and )
58 * lose the usual meaning.
59 * @param string $label Label of regex to be returned
60 * on a match.
61 * @access public
63 function addPattern($pattern, $label = true) {
64 $count = count($this->_patterns);
65 $this->_patterns[$count] = $pattern;
66 $this->_labels[$count] = $label;
67 $this->_regex = null;
70 /**
71 * Attempts to match all patterns at once against
72 * a string.
73 * @param string $subject String to match against.
74 * @param string $match First matched portion of
75 * subject.
76 * @return boolean True on success.
77 * @access public
79 function match($subject, &$match) {
80 if (count($this->_patterns) == 0) {
81 return false;
83 if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
84 $match = '';
85 return false;
87 $match = $matches[0];
88 for ($i = 1; $i < count($matches); $i++) {
89 if ($matches[$i]) {
90 return $this->_labels[$i - 1];
93 return true;
96 /**
97 * Compounds the patterns into a single
98 * regular expression separated with the
99 * "or" operator. Caches the regex.
100 * Will automatically escape (, ) and / tokens.
101 * @param array $patterns List of patterns in order.
102 * @access private
104 function _getCompoundedRegex() {
105 if ($this->_regex == null) {
106 for ($i = 0, $count = count($this->_patterns); $i < $count; $i++) {
107 $this->_patterns[$i] = '(' . str_replace(
108 array('/', '(', ')'),
109 array('\/', '\(', '\)'),
110 $this->_patterns[$i]) . ')';
112 $this->_regex = "/" . implode("|", $this->_patterns) . "/" . $this->_getPerlMatchingFlags();
114 return $this->_regex;
118 * Accessor for perl regex mode flags to use.
119 * @return string Perl regex flags.
120 * @access private
122 function _getPerlMatchingFlags() {
123 return ($this->_case ? "msS" : "msSi");
128 * States for a stack machine.
129 * @package SimpleTest
130 * @subpackage WebTester
132 class SimpleStateStack {
133 var $_stack;
136 * Constructor. Starts in named state.
137 * @param string $start Starting state name.
138 * @access public
140 function SimpleStateStack($start) {
141 $this->_stack = array($start);
145 * Accessor for current state.
146 * @return string State.
147 * @access public
149 function getCurrent() {
150 return $this->_stack[count($this->_stack) - 1];
154 * Adds a state to the stack and sets it
155 * to be the current state.
156 * @param string $state New state.
157 * @access public
159 function enter($state) {
160 array_push($this->_stack, $state);
164 * Leaves the current state and reverts
165 * to the previous one.
166 * @return boolean False if we drop off
167 * the bottom of the list.
168 * @access public
170 function leave() {
171 if (count($this->_stack) == 1) {
172 return false;
174 array_pop($this->_stack);
175 return true;
180 * Accepts text and breaks it into tokens.
181 * Some optimisation to make the sure the
182 * content is only scanned by the PHP regex
183 * parser once. Lexer modes must not start
184 * with leading underscores.
185 * @package SimpleTest
186 * @subpackage WebTester
188 class SimpleLexer {
189 var $_regexes;
190 var $_parser;
191 var $_mode;
192 var $_mode_handlers;
193 var $_case;
196 * Sets up the lexer in case insensitive matching
197 * by default.
198 * @param SimpleSaxParser $parser Handling strategy by
199 * reference.
200 * @param string $start Starting handler.
201 * @param boolean $case True for case sensitive.
202 * @access public
204 function SimpleLexer(&$parser, $start = "accept", $case = false) {
205 $this->_case = $case;
206 $this->_regexes = array();
207 $this->_parser = &$parser;
208 $this->_mode = &new SimpleStateStack($start);
209 $this->_mode_handlers = array($start => $start);
213 * Adds a token search pattern for a particular
214 * parsing mode. The pattern does not change the
215 * current mode.
216 * @param string $pattern Perl style regex, but ( and )
217 * lose the usual meaning.
218 * @param string $mode Should only apply this
219 * pattern when dealing with
220 * this type of input.
221 * @access public
223 function addPattern($pattern, $mode = "accept") {
224 if (! isset($this->_regexes[$mode])) {
225 $this->_regexes[$mode] = new ParallelRegex($this->_case);
227 $this->_regexes[$mode]->addPattern($pattern);
228 if (! isset($this->_mode_handlers[$mode])) {
229 $this->_mode_handlers[$mode] = $mode;
234 * Adds a pattern that will enter a new parsing
235 * mode. Useful for entering parenthesis, strings,
236 * tags, etc.
237 * @param string $pattern Perl style regex, but ( and )
238 * lose the usual meaning.
239 * @param string $mode Should only apply this
240 * pattern when dealing with
241 * this type of input.
242 * @param string $new_mode Change parsing to this new
243 * nested mode.
244 * @access public
246 function addEntryPattern($pattern, $mode, $new_mode) {
247 if (! isset($this->_regexes[$mode])) {
248 $this->_regexes[$mode] = new ParallelRegex($this->_case);
250 $this->_regexes[$mode]->addPattern($pattern, $new_mode);
251 if (! isset($this->_mode_handlers[$new_mode])) {
252 $this->_mode_handlers[$new_mode] = $new_mode;
257 * Adds a pattern that will exit the current mode
258 * and re-enter the previous one.
259 * @param string $pattern Perl style regex, but ( and )
260 * lose the usual meaning.
261 * @param string $mode Mode to leave.
262 * @access public
264 function addExitPattern($pattern, $mode) {
265 if (! isset($this->_regexes[$mode])) {
266 $this->_regexes[$mode] = new ParallelRegex($this->_case);
268 $this->_regexes[$mode]->addPattern($pattern, "__exit");
269 if (! isset($this->_mode_handlers[$mode])) {
270 $this->_mode_handlers[$mode] = $mode;
275 * Adds a pattern that has a special mode. Acts as an entry
276 * and exit pattern in one go, effectively calling a special
277 * parser handler for this token only.
278 * @param string $pattern Perl style regex, but ( and )
279 * lose the usual meaning.
280 * @param string $mode Should only apply this
281 * pattern when dealing with
282 * this type of input.
283 * @param string $special Use this mode for this one token.
284 * @access public
286 function addSpecialPattern($pattern, $mode, $special) {
287 if (! isset($this->_regexes[$mode])) {
288 $this->_regexes[$mode] = new ParallelRegex($this->_case);
290 $this->_regexes[$mode]->addPattern($pattern, "_$special");
291 if (! isset($this->_mode_handlers[$special])) {
292 $this->_mode_handlers[$special] = $special;
297 * Adds a mapping from a mode to another handler.
298 * @param string $mode Mode to be remapped.
299 * @param string $handler New target handler.
300 * @access public
302 function mapHandler($mode, $handler) {
303 $this->_mode_handlers[$mode] = $handler;
307 * Splits the page text into tokens. Will fail
308 * if the handlers report an error or if no
309 * content is consumed. If successful then each
310 * unparsed and parsed token invokes a call to the
311 * held listener.
312 * @param string $raw Raw HTML text.
313 * @return boolean True on success, else false.
314 * @access public
316 function parse($raw) {
317 if (! isset($this->_parser)) {
318 return false;
320 $length = strlen($raw);
321 while (is_array($parsed = $this->_reduce($raw))) {
322 list($raw, $unmatched, $matched, $mode) = $parsed;
323 if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
324 return false;
326 if ($raw === '') {
327 return true;
329 if (strlen($raw) == $length) {
330 return false;
332 $length = strlen($raw);
334 if (! $parsed) {
335 return false;
337 return $this->_invokeParser($raw, LEXER_UNMATCHED);
341 * Sends the matched token and any leading unmatched
342 * text to the parser changing the lexer to a new
343 * mode if one is listed.
344 * @param string $unmatched Unmatched leading portion.
345 * @param string $matched Actual token match.
346 * @param string $mode Mode after match. A boolean
347 * false mode causes no change.
348 * @return boolean False if there was any error
349 * from the parser.
350 * @access private
352 function _dispatchTokens($unmatched, $matched, $mode = false) {
353 if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
354 return false;
356 if (is_bool($mode)) {
357 return $this->_invokeParser($matched, LEXER_MATCHED);
359 if ($this->_isModeEnd($mode)) {
360 if (! $this->_invokeParser($matched, LEXER_EXIT)) {
361 return false;
363 return $this->_mode->leave();
365 if ($this->_isSpecialMode($mode)) {
366 $this->_mode->enter($this->_decodeSpecial($mode));
367 if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
368 return false;
370 return $this->_mode->leave();
372 $this->_mode->enter($mode);
373 return $this->_invokeParser($matched, LEXER_ENTER);
377 * Tests to see if the new mode is actually to leave
378 * the current mode and pop an item from the matching
379 * mode stack.
380 * @param string $mode Mode to test.
381 * @return boolean True if this is the exit mode.
382 * @access private
384 function _isModeEnd($mode) {
385 return ($mode === "__exit");
389 * Test to see if the mode is one where this mode
390 * is entered for this token only and automatically
391 * leaves immediately afterwoods.
392 * @param string $mode Mode to test.
393 * @return boolean True if this is the exit mode.
394 * @access private
396 function _isSpecialMode($mode) {
397 return (strncmp($mode, "_", 1) == 0);
401 * Strips the magic underscore marking single token
402 * modes.
403 * @param string $mode Mode to decode.
404 * @return string Underlying mode name.
405 * @access private
407 function _decodeSpecial($mode) {
408 return substr($mode, 1);
412 * Calls the parser method named after the current
413 * mode. Empty content will be ignored. The lexer
414 * has a parser handler for each mode in the lexer.
415 * @param string $content Text parsed.
416 * @param boolean $is_match Token is recognised rather
417 * than unparsed data.
418 * @access private
420 function _invokeParser($content, $is_match) {
421 if (($content === '') || ($content === false)) {
422 return true;
424 $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
425 return $this->_parser->$handler($content, $is_match);
429 * Tries to match a chunk of text and if successful
430 * removes the recognised chunk and any leading
431 * unparsed data. Empty strings will not be matched.
432 * @param string $raw The subject to parse. This is the
433 * content that will be eaten.
434 * @return array/boolean Three item list of unparsed
435 * content followed by the
436 * recognised token and finally the
437 * action the parser is to take.
438 * True if no match, false if there
439 * is a parsing error.
440 * @access private
442 function _reduce($raw) {
443 if ($action = $this->_regexes[$this->_mode->getCurrent()]->match($raw, $match)) {
444 $unparsed_character_count = strpos($raw, $match);
445 $unparsed = substr($raw, 0, $unparsed_character_count);
446 $raw = substr($raw, $unparsed_character_count + strlen($match));
447 return array($raw, $unparsed, $match, $action);
449 return true;
454 * Breas HTML into SAX events.
455 * @package SimpleTest
456 * @subpackage WebTester
458 class SimpleHtmlLexer extends SimpleLexer {
461 * Sets up the lexer with case insensitive matching
462 * and adds the HTML handlers.
463 * @param SimpleSaxParser $parser Handling strategy by
464 * reference.
465 * @access public
467 function SimpleHtmlLexer(&$parser) {
468 $this->SimpleLexer($parser, 'text');
469 $this->mapHandler('text', 'acceptTextToken');
470 $this->_addSkipping();
471 foreach ($this->_getParsedTags() as $tag) {
472 $this->_addTag($tag);
474 $this->_addInTagTokens();
478 * List of parsed tags. Others are ignored.
479 * @return array List of searched for tags.
480 * @access private
482 function _getParsedTags() {
483 return array('a', 'title', 'form', 'input', 'button', 'textarea', 'select',
484 'option', 'frameset', 'frame', 'label');
488 * The lexer has to skip certain sections such
489 * as server code, client code and styles.
490 * @access private
492 function _addSkipping() {
493 $this->mapHandler('css', 'ignore');
494 $this->addEntryPattern('<style', 'text', 'css');
495 $this->addExitPattern('</style>', 'css');
496 $this->mapHandler('js', 'ignore');
497 $this->addEntryPattern('<script', 'text', 'js');
498 $this->addExitPattern('</script>', 'js');
499 $this->mapHandler('comment', 'ignore');
500 $this->addEntryPattern('<!--', 'text', 'comment');
501 $this->addExitPattern('-->', 'comment');
505 * Pattern matches to start and end a tag.
506 * @param string $tag Name of tag to scan for.
507 * @access private
509 function _addTag($tag) {
510 $this->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
511 $this->addEntryPattern("<$tag", 'text', 'tag');
515 * Pattern matches to parse the inside of a tag
516 * including the attributes and their quoting.
517 * @access private
519 function _addInTagTokens() {
520 $this->mapHandler('tag', 'acceptStartToken');
521 $this->addSpecialPattern('\s+', 'tag', 'ignore');
522 $this->_addAttributeTokens();
523 $this->addExitPattern('/>', 'tag');
524 $this->addExitPattern('>', 'tag');
528 * Matches attributes that are either single quoted,
529 * double quoted or unquoted.
530 * @access private
532 function _addAttributeTokens() {
533 $this->mapHandler('dq_attribute', 'acceptAttributeToken');
534 $this->addEntryPattern('=\s*"', 'tag', 'dq_attribute');
535 $this->addPattern("\\\\\"", 'dq_attribute');
536 $this->addExitPattern('"', 'dq_attribute');
537 $this->mapHandler('sq_attribute', 'acceptAttributeToken');
538 $this->addEntryPattern("=\s*'", 'tag', 'sq_attribute');
539 $this->addPattern("\\\\'", 'sq_attribute');
540 $this->addExitPattern("'", 'sq_attribute');
541 $this->mapHandler('uq_attribute', 'acceptAttributeToken');
542 $this->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
547 * Converts HTML tokens into selected SAX events.
548 * @package SimpleTest
549 * @subpackage WebTester
551 class SimpleHtmlSaxParser {
552 var $_lexer;
553 var $_listener;
554 var $_tag;
555 var $_attributes;
556 var $_current_attribute;
559 * Sets the listener.
560 * @param SimpleSaxListener $listener SAX event handler.
561 * @access public
563 function SimpleHtmlSaxParser(&$listener) {
564 $this->_listener = &$listener;
565 $this->_lexer = &$this->createLexer($this);
566 $this->_tag = '';
567 $this->_attributes = array();
568 $this->_current_attribute = '';
572 * Runs the content through the lexer which
573 * should call back to the acceptors.
574 * @param string $raw Page text to parse.
575 * @return boolean False if parse error.
576 * @access public
578 function parse($raw) {
579 return $this->_lexer->parse($raw);
583 * Sets up the matching lexer. Starts in 'text' mode.
584 * @param SimpleSaxParser $parser Event generator, usually $self.
585 * @return SimpleLexer Lexer suitable for this parser.
586 * @access public
587 * @static
589 function &createLexer(&$parser) {
590 $lexer = &new SimpleHtmlLexer($parser);
591 return $lexer;
595 * Accepts a token from the tag mode. If the
596 * starting element completes then the element
597 * is dispatched and the current attributes
598 * set back to empty. The element or attribute
599 * name is converted to lower case.
600 * @param string $token Incoming characters.
601 * @param integer $event Lexer event type.
602 * @return boolean False if parse error.
603 * @access public
605 function acceptStartToken($token, $event) {
606 if ($event == LEXER_ENTER) {
607 $this->_tag = strtolower(substr($token, 1));
608 return true;
610 if ($event == LEXER_EXIT) {
611 $success = $this->_listener->startElement(
612 $this->_tag,
613 $this->_attributes);
614 $this->_tag = '';
615 $this->_attributes = array();
616 return $success;
618 if ($token != '=') {
619 $this->_current_attribute = strtolower(SimpleHtmlSaxParser::decodeHtml($token));
620 $this->_attributes[$this->_current_attribute] = '';
622 return true;
626 * Accepts a token from the end tag mode.
627 * The element name is converted to lower case.
628 * @param string $token Incoming characters.
629 * @param integer $event Lexer event type.
630 * @return boolean False if parse error.
631 * @access public
633 function acceptEndToken($token, $event) {
634 if (! preg_match('/<\/(.*)>/', $token, $matches)) {
635 return false;
637 return $this->_listener->endElement(strtolower($matches[1]));
641 * Part of the tag data.
642 * @param string $token Incoming characters.
643 * @param integer $event Lexer event type.
644 * @return boolean False if parse error.
645 * @access public
647 function acceptAttributeToken($token, $event) {
648 if ($this->_current_attribute) {
649 if ($event == LEXER_UNMATCHED) {
650 $this->_attributes[$this->_current_attribute] .=
651 SimpleHtmlSaxParser::decodeHtml($token);
653 if ($event == LEXER_SPECIAL) {
654 $this->_attributes[$this->_current_attribute] .=
655 preg_replace('/^=\s*/' , '', SimpleHtmlSaxParser::decodeHtml($token));
658 return true;
662 * A character entity.
663 * @param string $token Incoming characters.
664 * @param integer $event Lexer event type.
665 * @return boolean False if parse error.
666 * @access public
668 function acceptEntityToken($token, $event) {
672 * Character data between tags regarded as
673 * important.
674 * @param string $token Incoming characters.
675 * @param integer $event Lexer event type.
676 * @return boolean False if parse error.
677 * @access public
679 function acceptTextToken($token, $event) {
680 return $this->_listener->addContent($token);
684 * Incoming data to be ignored.
685 * @param string $token Incoming characters.
686 * @param integer $event Lexer event type.
687 * @return boolean False if parse error.
688 * @access public
690 function ignore($token, $event) {
691 return true;
695 * Decodes any HTML entities.
696 * @param string $html Incoming HTML.
697 * @return string Outgoing plain text.
698 * @access public
699 * @static
701 function decodeHtml($html) {
702 static $translations;
703 if (! isset($translations)) {
704 $translations = array_flip(get_html_translation_table(HTML_ENTITIES));
706 return strtr($html, $translations);
710 * Turns HTML into text browser visible text. Images
711 * are converted to their alt text and tags are supressed.
712 * Entities are converted to their visible representation.
713 * @param string $html HTML to convert.
714 * @return string Plain text.
715 * @access public
716 * @static
718 function normalise($html) {
719 $text = preg_replace('|<!--.*?-->|', '', $html);
720 $text = preg_replace('|<img.*?alt\s*=\s*"(.*?)".*?>|', ' \1 ', $text);
721 $text = preg_replace('|<img.*?alt\s*=\s*\'(.*?)\'.*?>|', ' \1 ', $text);
722 $text = preg_replace('|<img.*?alt\s*=\s*([a-zA-Z_]+).*?>|', ' \1 ', $text);
723 $text = preg_replace('|<.*?>|', '', $text);
724 $text = SimpleHtmlSaxParser::decodeHtml($text);
725 $text = preg_replace('|\s+|', ' ', $text);
726 return trim($text);
731 * SAX event handler.
732 * @package SimpleTest
733 * @subpackage WebTester
734 * @abstract
736 class SimpleSaxListener {
739 * Sets the document to write to.
740 * @access public
742 function SimpleSaxListener() {
746 * Start of element event.
747 * @param string $name Element name.
748 * @param hash $attributes Name value pairs.
749 * Attributes without content
750 * are marked as true.
751 * @return boolean False on parse error.
752 * @access public
754 function startElement($name, $attributes) {
758 * End of element event.
759 * @param string $name Element name.
760 * @return boolean False on parse error.
761 * @access public
763 function endElement($name) {
767 * Unparsed, but relevant data.
768 * @param string $text May include unparsed tags.
769 * @return boolean False on parse error.
770 * @access public
772 function addContent($text) {