MDL-39416 do not try to get detailed perflog info before PAGE int
[moodle.git] / lib / simpletestlib / tidy_parser.php
blob3d8b4b2ac7dc8217dfbe929fd250430c6d892645
1 <?php
2 /**
3 * base include file for SimpleTest
4 * @package SimpleTest
5 * @subpackage WebTester
6 * @version $Id: php_parser.php 1911 2009-07-29 16:38:04Z lastcraft $
7 */
/**
 *    Builds the page object from a response by parsing the content
 *    with the HTML Tidy extension and walking the resulting node tree.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleTidyPageBuilder {
    private $page;
    private $forms = array();
    private $labels = array();
    private $widgets_by_id = array();

    /**
     *    Releases everything gathered by the last parse run.
     */
    public function __destruct() {
        $this->free();
    }

    /**
     *    Frees up any references so as to allow the PHP garbage
     *    collection from unset() to work. Every per-parse collection
     *    is reset, including the widget index, so that a builder
     *    which is reused for a second parse() cannot attach labels
     *    to widgets left over from the previous page.
     */
    private function free() {
        $this->page = null;
        $this->forms = array();
        $this->labels = array();
        $this->widgets_by_id = array();
    }

    /**
     *    This builder is only available if the 'tidy' extension is loaded.
     *    @return boolean       True if available.
     */
    public function can() {
        return extension_loaded('tidy');
    }

    /**
     *    Reads the raw content the page using HTML Tidy.
     *    @param $response SimpleHttpResponse  Fetched response.
     *    @return SimplePage                   Newly parsed page.
     */
    public function parse($response) {
        $this->page = new SimplePage($response);
        // Guards are inserted before tidying and stripped again in
        // innerHtml() so that empty tags and textarea whitespace survive.
        $tidied = tidy_parse_string(
                $this->insertGuards($response->getContent()),
                array('output-xml' => false, 'wrap' => '0', 'indent' => 'no'),
                'latin1');
        $this->walkTree($tidied->html());
        $this->attachLabels($this->widgets_by_id, $this->labels);
        $this->page->setForms($this->forms);
        // Keep a local handle on the page, as free() drops the member.
        $page = $this->page;
        $this->free();
        return $page;
    }

    /**
     *    Stops HTMLTidy stripping content that we wish to preserve.
     *    @param string      The raw html.
     *    @return string     The html with guard tags inserted.
     */
    private function insertGuards($html) {
        return $this->insertEmptyTagGuards($this->insertTextareaSimpleWhitespaceGuards($html));
    }

    /**
     *    Removes the extra content added during the parse stage
     *    in order to preserve content we don't want stripped
     *    out by HTMLTidy.
     *    @param string      The raw html.
     *    @return string     The html with guard tags removed.
     */
    private function stripGuards($html) {
        return $this->stripTextareaWhitespaceGuards($this->stripEmptyTagGuards($html));
    }

    /**
     *    HTML tidy strips out empty tags such as <option> which we
     *    need to preserve. This method inserts an additional marker.
     *    @param string      The raw html.
     *    @return string     The html with guards inserted.
     */
    private function insertEmptyTagGuards($html) {
        return preg_replace('#<(option|textarea)([^>]*)>(\s*)</(option|textarea)>#is',
                            '<\1\2>___EMPTY___\3</\4>',
                            $html);
    }

    /**
     *    HTML tidy strips out empty tags such as <option> which we
     *    need to preserve. This method strips additional markers
     *    inserted by SimpleTest to the tidy output used to make the
     *    tags non-empty. This ensures their preservation.
     *    @param string      The raw html.
     *    @return string     The html with guards removed.
     */
    private function stripEmptyTagGuards($html) {
        return preg_replace('#(^|>)(\s*)___EMPTY___(\s*)(</|$)#i', '\2\3', $html);
    }

    /**
     *    By parsing the XML output of tidy, we lose some whitespace
     *    information in textarea tags. We temporarily recode this
     *    data ourselves so as not to lose it.
     *    @param string      The raw html.
     *    @return string     The html with guards inserted.
     */
    private function insertTextareaSimpleWhitespaceGuards($html) {
        return preg_replace_callback('#<textarea([^>]*)>(.*?)</textarea>#is',
                                     array($this, 'insertWhitespaceGuards'),
                                     $html);
    }

    /**
     *    Callback for insertTextareaSimpleWhitespaceGuards().
     *    @param array $matches       Result of preg_replace_callback().
     *    @return string              Guard tags now replace whitespace.
     */
    private function insertWhitespaceGuards($matches) {
        return '<textarea' . $matches[1] . '>' .
                str_replace(array("\n", "\r", "\t", ' '),
                            array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
                            $matches[2]) .
                '</textarea>';
    }

    /**
     *    Removes the whitespace preserving guards we added
     *    before parsing.
     *    @param string      The raw html.
     *    @return string     The html with guards removed.
     */
    private function stripTextareaWhitespaceGuards($html) {
        return str_replace(array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
                           array("\n", "\r", "\t", ' '),
                           $html);
    }

    /**
     *    Visits the given node and all children. Links, the base
     *    URL, the title, framesets, forms and labels are recorded
     *    as they are found. Any other node (including a base tag
     *    without an href) just has its children visited.
     *    @param object $node      Tidy XML node.
     */
    private function walkTree($node) {
        if ($node->name === 'a') {
            $this->page->addLink($this->createTagWithContent($node));
        } elseif ($node->name === 'base' and isset($node->attribute['href'])) {
            $this->page->setBase($node->attribute['href']);
        } elseif ($node->name === 'title') {
            $this->page->setTitle($this->createTagWithContent($node));
        } elseif ($node->name === 'frameset') {
            $this->page->setFrames($this->collectFrames($node));
        } elseif ($node->name === 'form') {
            $this->forms[] = $this->walkForm($node, $this->createEmptyForm($node));
        } elseif ($node->name === 'label') {
            $this->labels[] = $this->createTagWithContent($node);
        } else {
            $this->walkChildren($node);
        }
    }

    /**
     *    Helper method for traversing the XML tree.
     *    @param object $node     Tidy XML node.
     */
    private function walkChildren($node) {
        if ($node->hasChildren()) {
            foreach ($node->child as $child) {
                $this->walkTree($child);
            }
        }
    }

    /**
     *    Builds a tag from the node name and the attributes Tidy
     *    parsed, then fills it with the node's inner HTML.
     *    @param object $node     Tidy XML node.
     *    @return SimpleTag       Tag holding the node content.
     */
    private function createTagWithContent($node) {
        return $this->tags()->createTag($node->name, (array)$node->attribute)
                            ->addContent($this->innerHtml($node));
    }

    /**
     *    Facade for forms containing preparsed widgets.
     *    @param object $node     Tidy XML node.
     *    @return SimpleForm      Facade for SimpleBrowser.
     */
    private function createEmptyForm($node) {
        return new SimpleForm($this->tags()->createTag($node->name, (array)$node->attribute), $this->page);
    }

    /**
     *    Visits the given node and all children, collecting the
     *    links, widgets and labels found inside a form.
     *    @param object $node               Tidy XML node.
     *    @param SimpleForm $form           Form receiving the widgets.
     *    @param string $enclosing_label    The label of any label
     *                                      tag we might be in.
     *    @return SimpleForm                The form passed in.
     */
    private function walkForm($node, $form, $enclosing_label = '') {
        if ($node->name === 'a') {
            $this->page->addLink($this->createTagWithContent($node));
        } elseif (in_array($node->name, array('input', 'button', 'textarea', 'select'), true)) {
            $this->addWidgetToForm($node, $form, $enclosing_label);
        } elseif ($node->name === 'label') {
            $content = $this->innerHtml($node);
            $this->labels[] = $this->tags()->createTag($node->name, (array)$node->attribute)
                                           ->addContent($content);
            if ($node->hasChildren()) {
                // The label text is the same for every child, so work
                // it out once rather than on each pass of the loop.
                $label_text = SimplePage::normalise($content);
                foreach ($node->child as $child) {
                    $this->walkForm($child, $form, $label_text);
                }
            }
        } elseif ($node->hasChildren()) {
            foreach ($node->child as $child) {
                // Pass the label on, otherwise a widget wrapped in another
                // element inside the label would lose its enclosing label.
                $this->walkForm($child, $form, $enclosing_label);
            }
        }
        return $form;
    }

    /**
     *    Tests a node for a "for" attribute. Used for
     *    attaching labels.
     *    @param object $node      Tidy XML node.
     *    @return boolean          True if a non-empty "for"
     *                             attribute exists.
     */
    private function hasFor($node) {
        // empty() copes with a missing attribute list or a missing key
        // without raising an undefined index warning.
        return ! empty($node->attribute['for']);
    }

    /**
     *    Adds the widget into the form container. Nothing is added
     *    if the tag factory does not produce a widget for the node.
     *    @param object $node               Tidy XML node of widget.
     *    @param SimpleForm $form           Form to add it to.
     *    @param string $enclosing_label    The label of any label
     *                                      tag we might be in.
     */
    private function addWidgetToForm($node, $form, $enclosing_label) {
        $widget = $this->tags()->createTag($node->name, $this->attributes($node));
        if (! $widget) {
            return;
        }
        $widget->setLabel($enclosing_label)
               ->addContent($this->innerHtml($node));
        if ($node->name === 'select') {
            $widget->addTags($this->collectSelectOptions($node));
        }
        $form->addWidget($widget);
        $this->indexWidgetById($widget);
    }

    /**
     *    Fills the widget cache to speed up searching. Widgets
     *    without an id are not cached.
     *    @param SimpleTag $widget    Parsed widget to cache.
     */
    private function indexWidgetById($widget) {
        $id = $widget->getAttribute('id');
        if (! $id) {
            return;
        }
        if (! isset($this->widgets_by_id[$id])) {
            $this->widgets_by_id[$id] = array();
        }
        $this->widgets_by_id[$id][] = $widget;
    }

    /**
     *    Parses the options from inside an XML select node.
     *    @param object $node      Tidy XML node.
     *    @return array            List of SimpleTag options.
     */
    private function collectSelectOptions($node) {
        $options = array();
        if ($node->name === 'option') {
            $options[] = $this->tags()->createTag($node->name, $this->attributes($node))
                                      ->addContent($this->innerHtml($node));
        }
        if ($node->hasChildren()) {
            foreach ($node->child as $child) {
                $options = array_merge($options, $this->collectSelectOptions($child));
            }
        }
        return $options;
    }

    /**
     *    Convenience method for collecting all the attributes
     *    of a tag. Not sure why Tidy does not have this. The
     *    attributes are read from the text of the node's opening tag.
     *    @param object $node      Tidy XML node.
     *    @return array            Hash of attribute strings.
     */
    private function attributes($node) {
        if (! preg_match('|<[^ ]+\s(.*?)/?>|s', $node->value, $first_tag_contents)) {
            return array();
        }
        $attributes = array();
        preg_match_all('/\S+\s*=\s*\'[^\']*\'|(\S+\s*=\s*"[^"]*")|([^ =]+\s*=\s*[^ "\']+?)|[^ "\']+/', $first_tag_contents[1], $matches);
        foreach ($matches[0] as $unparsed) {
            $attributes = $this->mergeAttribute($attributes, $unparsed);
        }
        return $attributes;
    }

    /**
     *    Overlay an attribute into the attributes hash. An
     *    attribute given without a value uses its own name as
     *    the value.
     *    @param array $attributes    Current attribute list.
     *    @param string $raw          Raw attribute string with
     *                                both key and value.
     *    @return array               New attribute hash.
     */
    private function mergeAttribute($attributes, $raw) {
        // Split on the first '=' only. The value may itself contain '='
        // (a query string, say) and must not be cut short.
        $parts = explode('=', $raw, 2);
        list($name, $value) = count($parts) == 1 ? array($parts[0], $parts[0]) : $parts;
        $attributes[trim($name)] = html_entity_decode($this->dequote(trim($value)), ENT_QUOTES);
        return $attributes;
    }

    /**
     *    Remove start and end quotes. A string that is not wholly
     *    wrapped in matching single or double quotes is returned
     *    unchanged.
     *    @param string $quoted    A quoted string.
     *    @return string           Quotes are gone.
     */
    private function dequote($quoted) {
        if (preg_match('/^(\'([^\']*)\'|"([^"]*)")$/', $quoted, $matches)) {
            // Group 3 is only present for the double quoted form.
            return isset($matches[3]) ? $matches[3] : $matches[2];
        }
        return $quoted;
    }

    /**
     *    Collects frame information inside a frameset tag.
     *    @param object $node     Tidy XML node.
     *    @return array           List of SimpleTag frame descriptions.
     */
    private function collectFrames($node) {
        if ($node->name === 'frame') {
            return array($this->tags()->createTag($node->name, (array)$node->attribute));
        }
        $frames = array();
        if ($node->hasChildren()) {
            foreach ($node->child as $child) {
                $frames = array_merge($frames, $this->collectFrames($child));
            }
        }
        return $frames;
    }

    /**
     *    Extracts the content of a node by joining the values of
     *    its direct children and stripping the guards added
     *    before parsing.
     *    @param object $node     Tidy XML node.
     *    @return string          The inner content, guards removed.
     */
    private function innerHtml($node) {
        $raw = '';
        if ($node->hasChildren()) {
            foreach ($node->child as $child) {
                $raw .= $child->value;
            }
        }
        return $this->stripGuards($raw);
    }

    /**
     *    Factory for parsed content holders.
     *    @return SimpleTagBuilder    Factory.
     */
    private function tags() {
        return new SimpleTagBuilder();
    }

    /**
     *    Called at the end of a parse run. Attaches any
     *    non-wrapping labels to their form elements.
     *    @param array $widgets_by_id    Cached SimpleTag hash.
     *    @param array $labels           SimpleTag label elements.
     */
    private function attachLabels($widgets_by_id, $labels) {
        foreach ($labels as $label) {
            $for = $label->getFor();
            if ($for and isset($widgets_by_id[$for])) {
                $text = $label->getText();
                foreach ($widgets_by_id[$for] as $widget) {
                    $widget->setLabel($text);
                }
            }
        }
    }
}