[1.1.2]
[htmlpurifier.git] / library / HTMLPurifier / ChildDef.php
blob793ec51ab929786d9671d442510fe2359464c643
1 <?php
3 // HTMLPurifier_ChildDef and inheritance have three types of output:
4 // true = leave nodes as is
5 // false = delete parent node and all children
6 // array(...) = replace children nodes with these
// Registers the Core.EscapeInvalidChildren directive (bool, default false),
// which HTMLPurifier_ChildDef_Required::validateChildren() reads to decide
// between escaping and dropping disallowed child elements.
HTMLPurifier_ConfigSchema::define(
    'Core', 'EscapeInvalidChildren', false, 'bool',
    'When true, a child is found that is not allowed in the context of the '.
    'parent element will be transformed into text as if it were ASCII. When '.
    'false, that element and all internal tags will be dropped, though text '.
    'will be preserved. There is no option for dropping the element but '.
    'preserving child nodes.'
);
/**
 * Defines allowed child nodes and validates tokens against it.
 *
 * This base class only declares the interface; calling validateChildren()
 * on it directly raises an E_USER_ERROR. Subclasses override the method.
 */
class HTMLPurifier_ChildDef
{
    /**
     * Type of child definition, usually right-most part of class name
     * lowercase.
     *
     * Used occasionally in terms of context. Values set by the subclasses
     * in this file are custom, required, optional, empty and table.
     */
    var $type;

    /**
     * Bool that indicates whether or not an empty array of children is okay.
     *
     * This is necessary for redundant checking when changes affecting
     * a child node may cause a parent node to now be disallowed.
     */
    var $allow_empty;

    /**
     * Validates nodes according to definition and returns modification.
     *
     * @warning $context is NOT HTMLPurifier_AttrContext
     * @param $tokens_of_children Array of HTMLPurifier_Token
     * @param $config HTMLPurifier_Config object
     * @param $context String context indicating inline, block or unknown
     * @return bool true to leave nodes as is
     * @return bool false to remove parent node
     * @return array of replacement child tokens
     */
    function validateChildren($tokens_of_children, $config, $context) {
        // PHP 4 has no abstract methods, so the base implementation
        // signals misuse at runtime instead.
        trigger_error('Call to abstract function', E_USER_ERROR);
    }
}
/**
 * Custom validation class, accepts DTD child definitions
 *
 * @warning Currently this class is an all or nothing proposition, that is,
 *          it will only give a bool return value.
 * @note This class is currently not used by any code, although it is unit
 *       tested.
 */
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{
    var $type = 'custom';
    var $allow_empty = false;

    /**
     * Allowed child pattern as defined by the DTD
     */
    var $dtd_regex;

    /**
     * PCRE regex derived from $dtd_regex
     * @private
     */
    var $_pcre_regex;

    /**
     * @param $dtd_regex Allowed child pattern from the DTD
     */
    function __construct($dtd_regex) {
        $this->dtd_regex = $dtd_regex;
        $this->_compileRegex();
    }

    /**
     * Legacy PHP 4 constructor name; PHP 8 no longer treats a method named
     * after the class as a constructor, so it delegates to __construct().
     * @param $dtd_regex Allowed child pattern from the DTD
     */
    function HTMLPurifier_ChildDef_Custom($dtd_regex) {
        $this->__construct($dtd_regex);
    }

    /**
     * Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
     *
     * Spaces are stripped, the pattern is wrapped in parentheses if it does
     * not already start with one, every comma is made optional, and every
     * element name is wrapped in a group with an optional leading comma.
     *
     * NOTE(review): because all commas become optional, a pattern such as
     * "a,b" also matches the single name "ab" -- confirm whether that can
     * matter for the element names actually used.
     */
    function _compileRegex() {
        $raw = str_replace(' ', '', $this->dtd_regex);
        // strncmp() tolerates an empty pattern, where indexing the first
        // character would raise a notice
        if (strncmp($raw, '(', 1) !== 0) {
            $raw = "($raw)";
        }
        $reg = str_replace(',', ',?', $raw);
        $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg);
        $this->_pcre_regex = $reg;
    }

    /**
     * Matches the comma-separated names of the direct children against the
     * compiled regex.
     *
     * @param $tokens_of_children Array of tokens; each is read for ->type,
     *        ->name and (optionally) ->is_whitespace
     * @param $config Unused here
     * @param $context Unused here
     * @return bool true if the children match the pattern, false otherwise
     */
    function validateChildren($tokens_of_children, $config, $context) {
        $list_of_children = '';
        $nesting = 0; // depth into the nest
        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) continue;

            // depth is sampled BEFORE it is adjusted, so a child's start
            // tag counts as direct while its matching end tag does not
            $is_child = ($nesting == 0);

            if ($token->type == 'start') {
                $nesting++;
            } elseif ($token->type == 'end') {
                $nesting--;
            }

            if ($is_child) {
                $list_of_children .= $token->name . ',';
            }
        }
        $list_of_children = rtrim($list_of_children, ',');

        $okay =
            preg_match(
                '/^'.$this->_pcre_regex.'$/',
                $list_of_children
            );

        return (bool) $okay;
    }
}
/**
 * Definition that allows a set of elements, but disallows empty children.
 */
class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
{
    /**
     * Lookup table of allowed elements.
     */
    var $elements = array();

    /**
     * Generator used to turn a disallowed token back into text when
     * Core.EscapeInvalidChildren is enabled.
     */
    var $gen;

    var $allow_empty = false;
    var $type = 'required';

    /**
     * @param $elements List of allowed element names (lowercase), either as
     *        an array or as a '|'-separated string (spaces are ignored).
     */
    function __construct($elements) {
        if (is_string($elements)) {
            $elements = str_replace(' ', '', $elements);
            $elements = explode('|', $elements);
        }
        // turn the list into a name => true lookup table
        $elements = array_flip($elements);
        foreach ($elements as $i => $x) $elements[$i] = true;
        $this->elements = $elements;
        $this->gen = new HTMLPurifier_Generator();
    }

    /**
     * Legacy PHP 4 constructor name; PHP 8 no longer treats a method named
     * after the class as a constructor, so it delegates to __construct().
     * @param $elements List of allowed element names (lowercase).
     */
    function HTMLPurifier_ChildDef_Required($elements) {
        $this->__construct($elements);
    }

    /**
     * Filters the children down to the allowed elements.
     *
     * @param $tokens_of_children Array of tokens; each is read for ->type,
     *        ->name and (optionally) ->is_whitespace
     * @param $config Object whose get('Core', 'EscapeInvalidChildren')
     *        selects escaping over dropping
     * @param $context Unused here
     * @return bool true if nothing had to change
     * @return bool false if there are no tokens, nothing survives, or only
     *         whitespace was seen
     * @return array of replacement child tokens otherwise
     */
    function validateChildren($tokens_of_children, $config, $context) {
        // if there are no tokens, delete parent node
        if (empty($tokens_of_children)) return false;

        // the new set of children
        $result = array();

        // current depth into the nest
        $nesting = 0;

        // whether or not we're deleting a node
        $is_deleting = false;

        // whether or not parsed character data is allowed
        // this controls whether or not we silently drop a tag
        // or generate escaped HTML from it
        $pcdata_allowed = isset($this->elements['#PCDATA']);

        // a little sanity check to make sure it's not ALL whitespace
        $all_whitespace = true;

        // some configuration
        $escape_invalid_children = $config->get('Core', 'EscapeInvalidChildren');

        foreach ($tokens_of_children as $token) {
            // whitespace is always kept, at any depth
            if (!empty($token->is_whitespace)) {
                $result[] = $token;
                continue;
            }
            $all_whitespace = false; // phew, we're not talking about whitespace

            // depth is sampled BEFORE it is adjusted, so a child's start
            // tag counts as direct while its matching end tag does not
            $is_child = ($nesting == 0);

            if ($token->type == 'start') {
                $nesting++;
            } elseif ($token->type == 'end') {
                $nesting--;
            }

            if ($is_child) {
                // each direct child resets the deletion state
                $is_deleting = false;
                if (!isset($this->elements[$token->name])) {
                    $is_deleting = true;
                    if ($pcdata_allowed && $token->type == 'text') {
                        $result[] = $token;
                    } elseif ($pcdata_allowed && $escape_invalid_children) {
                        $result[] = new HTMLPurifier_Token_Text(
                            $this->gen->generateFromToken($token, $config)
                        );
                    }
                    continue;
                }
            }

            // descendants (and end tags) of the current direct child
            if (!$is_deleting || ($pcdata_allowed && $token->type == 'text')) {
                $result[] = $token;
            } elseif ($pcdata_allowed && $escape_invalid_children) {
                $result[] =
                    new HTMLPurifier_Token_Text(
                        $this->gen->generateFromToken( $token, $config )
                    );
            } else {
                // drop silently
            }
        }
        if (empty($result)) return false;
        if ($all_whitespace) return false;
        if ($tokens_of_children == $result) return true;
        return $result;
    }
}
/**
 * Definition that allows a set of elements, and allows an empty list of
 * children.
 * @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
 *       really, one shouldn't inherit from the other. Only altered behavior
 *       is to overload a returned false with an array. Thus, it will never
 *       return false.
 */
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
{
    var $allow_empty = true;
    var $type = 'optional';

    /**
     * Runs the parent's validation and converts its "remove parent node"
     * verdict into "keep parent, with no children".
     *
     * @param $tokens_of_children Array of tokens, passed to the parent
     * @param $config Passed to the parent
     * @param $context Passed to the parent
     * @return bool true to leave nodes as is
     * @return array of replacement child tokens (possibly empty)
     */
    function validateChildren($tokens_of_children, $config, $context) {
        $result = parent::validateChildren($tokens_of_children, $config, $context);
        if ($result === false) return array();
        return $result;
    }
}
/**
 * Definition that disallows all elements.
 * @warning validateChildren() in this class is actually never called, because
 *          empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
 *          before child definitions are parsed in earnest by
 *          HTMLPurifier_Strategy_FixNesting.
 */
class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
{
    var $allow_empty = true;
    var $type = 'empty';

    /**
     * Nothing to configure.
     */
    function __construct() {}

    /**
     * Legacy PHP 4 constructor name, kept so existing explicit calls to it
     * keep working.
     */
    function HTMLPurifier_ChildDef_Empty() {}

    /**
     * Always replaces the children with an empty list.
     *
     * @param $tokens_of_children Ignored
     * @param $config Ignored
     * @param $context Ignored
     * @return array empty array
     */
    function validateChildren($tokens_of_children, $config, $context) {
        return array();
    }
}
/**
 * Definition that uses different definitions depending on context.
 *
 * The del and ins tags are notable because they allow different types of
 * elements depending on whether or not they're in a block or inline context.
 * Chameleon allows this behavior to happen by using two different
 * definitions depending on context.  While this is somewhat generalized,
 * it is specifically intended for those two tags.
 */
class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
{
    /**
     * Instance of the definition object to use when inline. Usually stricter.
     */
    var $inline;

    /**
     * Instance of the definition object to use when block.
     */
    var $block;

    /**
     * @param $inline List of elements to allow when inline.
     * @param $block List of elements to allow when block.
     */
    function __construct($inline, $block) {
        $this->inline = new HTMLPurifier_ChildDef_Optional($inline);
        $this->block  = new HTMLPurifier_ChildDef_Optional($block);
    }

    /**
     * Legacy PHP 4 constructor name; PHP 8 no longer treats a method named
     * after the class as a constructor, so it delegates to __construct().
     * @param $inline List of elements to allow when inline.
     * @param $block List of elements to allow when block.
     */
    function HTMLPurifier_ChildDef_Chameleon($inline, $block) {
        $this->__construct($inline, $block);
    }

    /**
     * Delegates to the inline or block definition according to $context.
     *
     * @param $tokens_of_children Array of tokens, passed to the delegate
     * @param $config Passed to the delegate
     * @param $context 'inline', 'block' or 'unknown'; 'unknown' is treated
     *        like 'inline'. Any other value raises E_USER_ERROR.
     * @return mixed the delegate's result, or false on an invalid context
     */
    function validateChildren($tokens_of_children, $config, $context) {
        switch ($context) {
            case 'unknown':
            case 'inline':
                $result = $this->inline->validateChildren(
                    $tokens_of_children, $config, $context);
                break;
            case 'block':
                $result = $this->block->validateChildren(
                    $tokens_of_children, $config, $context);
                break;
            default:
                trigger_error('Invalid context', E_USER_ERROR);
                return false;
        }
        return $result;
    }
}
/**
 * Definition for tables
 *
 * Regroups the direct children of a table into the order
 * caption, col/colgroup, thead, tfoot, then tr/tbody content.
 */
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
{
    var $allow_empty = false;
    var $type = 'table';

    /**
     * Nothing to configure.
     */
    function __construct() {}

    /**
     * Legacy PHP 4 constructor name, kept so existing explicit calls to it
     * keep working.
     */
    function HTMLPurifier_ChildDef_Table() {}

    /**
     * Sorts the table's children into their canonical order.
     *
     * Only the first caption, thead and tfoot are kept as such; an extra
     * caption is discarded and an extra thead/tfoot is renamed to tbody.
     * Direct children that are neither table sections nor whitespace text
     * are dropped together with their contents.
     *
     * @param $tokens_of_children Array of tokens; each is read for ->type
     *        and ->name, text tokens also for ->is_whitespace
     * @param $config Unused here
     * @param $context Unused here
     * @return bool true if the children are already in the resulting order
     * @return bool false if there are no tokens or no tr/tbody content
     * @return array of replacement child tokens otherwise
     */
    function validateChildren($tokens_of_children, $config, $context) {
        if (empty($tokens_of_children)) return false;

        // this ensures that the loop gets run one last time before closing
        // up. It's a little bit of a hack, but it works! Just make sure you
        // get rid of the token later.
        $tokens_of_children[] = false;

        // only one of these elements is allowed in a table
        $caption = false;
        $thead   = false;
        $tfoot   = false;

        // as many of these as you want
        $cols    = array();
        $content = array();

        $nesting = 0; // current depth so we can determine nodes
        $is_collecting = false; // are we globbing together tokens to package
                                // into one of the collectors?
        $collection = array(); // collected nodes
        $tag_index = 0; // the first node might be whitespace,
                        // so this tells us where the start tag is

        foreach ($tokens_of_children as $token) {
            // depth is sampled BEFORE it is adjusted, so a child's start
            // tag counts as direct while its matching end tag does not
            $is_child = ($nesting == 0);

            if ($token === false) {
                // terminating sequence started
            } elseif ($token->type == 'start') {
                $nesting++;
            } elseif ($token->type == 'end') {
                $nesting--;
            }

            // handle node collection
            if ($is_collecting) {
                if ($is_child) {
                    // the previous section just ended:
                    // okay, let's stash the tokens away
                    // first token tells us the type of the collection
                    switch ($collection[$tag_index]->name) {
                        case 'tr':
                        case 'tbody':
                            $content[] = $collection;
                            break;
                        case 'caption':
                            // a second caption is discarded
                            if ($caption !== false) break;
                            $caption = $collection;
                            break;
                        case 'thead':
                        case 'tfoot':
                            // access the appropriate variable, $thead or $tfoot
                            $var = $collection[$tag_index]->name;
                            if ($$var === false) {
                                $$var = $collection;
                            } else {
                                // transmutate the first and last entries into
                                // tbody tags, and then put into content
                                $collection[$tag_index]->name = 'tbody';
                                $collection[count($collection)-1]->name = 'tbody';
                                $content[] = $collection;
                            }
                            break;
                        case 'colgroup':
                            $cols[] = $collection;
                            break;
                    }
                    $collection = array();
                    $is_collecting = false;
                    $tag_index = 0;
                } else {
                    // add the node to the collection
                    $collection[] = $token;
                }
            }

            // terminate
            if ($token === false) break;

            if ($is_child) {
                // determine what we're dealing with
                if ($token->name == 'col') {
                    // the only empty tag in the possie, we can handle it
                    // immediately
                    $cols[] = array_merge($collection, array($token));
                    $collection = array();
                    $tag_index = 0;
                    continue;
                }
                switch($token->name) {
                    case 'caption':
                    case 'colgroup':
                    case 'thead':
                    case 'tfoot':
                    case 'tbody':
                    case 'tr':
                        $is_collecting = true;
                        $collection[] = $token;
                        // nothing follows the switch, so leaving it moves
                        // on to the next token
                        break;
                    default:
                        if ($token->type == 'text' && $token->is_whitespace) {
                            // leading whitespace travels with the section
                            // that follows it
                            $collection[] = $token;
                            $tag_index++;
                        }
                        break;
                }
            }
        }

        if (empty($content)) return false;

        $ret = array();
        if ($caption !== false) $ret = array_merge($ret, $caption);
        foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
        if ($thead !== false) $ret = array_merge($ret, $thead);
        if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
        foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
        if (!empty($collection) && $is_collecting == false){
            // grab the trailing space
            $ret = array_merge($ret, $collection);
        }

        array_pop($tokens_of_children); // remove phantom token

        return ($ret === $tokens_of_children) ? true : $ret;
    }
}