[3.1.0] Allow arbitrary whitespace in %HTML.Allowed
[htmlpurifier.git] / library / HTMLPurifier / HTMLDefinition.php
blobce299f79cee8f0e1f5279205eb942461fbef9f84
1 <?php
3 /**
4 * Definition of the purified HTML that describes allowed children,
5 * attributes, and many other things.
6 *
7 * Conventions:
8 *
9 * All member variables that are prefixed with info
10 * (including the main $info array) are used by HTML Purifier internals
11 * and should not be directly edited when customizing the HTMLDefinition.
12 * They can usually be set via configuration directives or custom
13 * modules.
15 * On the other hand, member variables without the info prefix are used
16 * internally by the HTMLDefinition and MUST NOT be used by other HTML
17 * Purifier internals. Many of them, however, are public, and may be
18 * edited by userspace code to tweak the behavior of HTMLDefinition.
20 * @note This class is inspected by Printer_HTMLDefinition; please
21 * update that class if things here change.
23 * @warning Directives that change this object's structure must be in
24 * the HTML or Attr namespace!
26 class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
// FULLY-PUBLIC VARIABLES ---------------------------------------------

/**
 * Associative array of element names to HTMLPurifier_ElementDef.
 * Replaced wholesale by processModules() with the module manager's
 * element list, then pruned by setupConfigStuff().
 */
public $info = array();

/**
 * Associative array of global attribute name to attribute definition.
 */
public $info_global_attr = array();

/**
 * String name of parent element HTML will be going into.
 * Defaults to 'div'; overwritten from the HTML.Parent directive when
 * the module manager recognizes that element.
 */
public $info_parent = 'div';

/**
 * Definition for parent element, allows parent element to be a
 * tag that's not allowed inside the HTML fragment.
 */
public $info_parent_def;

/**
 * String name of element used to wrap inline elements in block context
 * @note This is rarely used except for BLOCKQUOTEs in strict mode
 */
public $info_block_wrapper = 'p';

/**
 * Associative array of deprecated tag name to HTMLPurifier_TagTransform
 */
public $info_tag_transform = array();

/**
 * Indexed list of HTMLPurifier_AttrTransform to be performed before validation.
 */
public $info_attr_transform_pre = array();

/**
 * Indexed list of HTMLPurifier_AttrTransform to be performed after validation.
 */
public $info_attr_transform_post = array();

/**
 * Nested lookup array of content set name (Block, Inline) to
 * element name to whether or not it belongs in that content set.
 */
public $info_content_sets = array();

/**
 * Doctype object, copied from the module manager in processModules().
 */
public $doctype;
86 // RAW CUSTOMIZATION STUFF --------------------------------------------
/**
 * Registers a custom attribute on an element of the anonymous module.
 *
 * When the anonymous module has no entry for the element yet, a blank
 * element is created first and the attribute is attached to that.
 *
 * @note This is strictly convenience, and does not have a corresponding
 *       method in HTMLPurifier_HTMLModule
 * @param $element_name String element name to add attribute to
 * @param $attr_name String name of attribute
 * @param $def Attribute definition, can be string or object, see
 *             HTMLPurifier_AttrTypes for details
 */
public function addAttribute($element_name, $attr_name, $def)
{
    $anon = $this->getAnonymousModule();

    // Reuse the module's existing element entry, or start from a blank one.
    $target = isset($anon->info[$element_name])
        ? $anon->info[$element_name]
        : $anon->addBlankElement($element_name);

    $target->attr[$attr_name] = $def;
}
/**
 * Adds a custom element to your HTML definition by forwarding all
 * arguments to the anonymous module's addElement().
 *
 * @note See HTMLPurifier_HTMLModule::addElement for detailed
 *       parameter and return value descriptions.
 */
public function addElement($element_name, $type, $contents, $attr_collections, $attributes)
{
    // No safety checks are made here: an element the user adds
    // explicitly is assumed to be safe. This may not be a good idea.
    return $this->getAnonymousModule()->addElement(
        $element_name,
        $type,
        $contents,
        $attr_collections,
        $attributes
    );
}
/**
 * Adds a blank element to your HTML definition, for overriding
 * existing behavior. The call is forwarded to the anonymous module.
 *
 * @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
 *       parameter and return value descriptions.
 */
public function addBlankElement($element_name)
{
    return $this->getAnonymousModule()->addBlankElement($element_name);
}
/**
 * Retrieves a reference to the anonymous module, so you can
 * bust out advanced features without having to make your own
 * module.
 *
 * The module is created on first use and named 'Anonymous'.
 * processModules() hands it to the module manager and then unset()s
 * the backing property, so a call made after that point creates a
 * fresh module.
 *
 * @return HTMLPurifier_HTMLModule the anonymous module instance
 */
public function getAnonymousModule()
{
    // isset() instead of a truthiness test: once processModules() has
    // unset() the property, reading it directly would touch an
    // undefined property and raise a diagnostic.
    if (!isset($this->_anonModule)) {
        $module = new HTMLPurifier_HTMLModule();
        $module->name = 'Anonymous';
        $this->_anonModule = $module;
    }
    return $this->_anonModule;
}

/**
 * Lazily created anonymous module; null until getAnonymousModule()
 * is first called.
 */
private $_anonModule;
// PUBLIC BUT INTERNAL VARIABLES --------------------------------------

/** Definition type identifier. */
public $type = 'HTML';

/**
 * Instance of HTMLPurifier_HTMLModuleManager. Created in the
 * constructor and unset() again at the end of doSetup().
 */
public $manager; /**< Instance of HTMLPurifier_HTMLModuleManager */
/**
 * Cheap, preliminary initialization: only the module manager is
 * instantiated here.
 */
public function __construct()
{
    $this->manager = new HTMLPurifier_HTMLModuleManager();
}
/**
 * Builds the definition: pulls data out of the module manager, applies
 * the configuration-driven restrictions, then discards the manager.
 *
 * @param $config Configuration object passed through to both stages
 */
protected function doSetup($config)
{
    $this->processModules($config);
    $this->setupConfigStuff($config);
    unset($this->manager);

    // cleanup some of the element definitions
    foreach (array_keys($this->info) as $name) {
        unset(
            $this->info[$name]->content_model,
            $this->info[$name]->content_model_type
        );
    }
}
/**
 * Extract out the information from the manager: doctype, tag and
 * attribute transforms, elements and content sets.
 *
 * @param $config Configuration object handed to the manager's setup()
 */
protected function processModules($config)
{
    $manager = $this->manager;

    if ($this->_anonModule) {
        // for user specific changes
        // this is late-loaded so we don't have to deal with PHP4
        // reference wonky-ness
        $manager->addModule($this->_anonModule);
        unset($this->_anonModule);
    }

    $manager->setup($config);
    $this->doctype = $manager->doctype;

    // The three transform tables are merged the same way: a value of
    // false removes an entry set by an earlier module, anything else
    // sets or overrides it.
    $tables = array(
        'info_tag_transform',
        'info_attr_transform_pre',
        'info_attr_transform_post',
    );
    foreach ($manager->modules as $module) {
        foreach ($tables as $table) {
            foreach ($module->$table as $name => $transform) {
                if ($transform === false) {
                    unset($this->{$table}[$name]);
                } else {
                    $this->{$table}[$name] = $transform;
                }
            }
        }
    }

    $this->info = $manager->getElements();
    $this->info_content_sets = $manager->contentSets->lookup;
}
/**
 * Sets up stuff based on config. We need a better way of doing this.
 *
 * Runs four stages in order:
 *  1. block wrapper (HTML.BlockWrapper) and parent element (HTML.Parent);
 *  2. element whitelist (HTML.AllowedElements, or HTML.Allowed when
 *     neither HTML.AllowedElements nor HTML.AllowedAttributes is an array);
 *  3. attribute whitelist (HTML.AllowedAttributes / HTML.Allowed);
 *  4. blacklists (HTML.ForbiddenElements, HTML.ForbiddenAttributes).
 *
 * Problems are reported through trigger_error(); user-supplied names
 * are passed through htmlspecialchars() first because PHP does not
 * escape error messages.
 *
 * @param $config Configuration object the directives are read from
 */
protected function setupConfigStuff($config)
{
    $block_wrapper = $config->get('HTML', 'BlockWrapper');
    if (isset($this->info_content_sets['Block'][$block_wrapper])) {
        $this->info_block_wrapper = $block_wrapper;
    } else {
        trigger_error('Cannot use non-block element as block wrapper',
            E_USER_ERROR);
    }

    $parent = $config->get('HTML', 'Parent');
    $def = $this->manager->getElement($parent, true);
    if ($def) {
        $this->info_parent = $parent;
        $this->info_parent_def = $def;
    } else {
        trigger_error('Cannot use unrecognized element as parent',
            E_USER_ERROR);
        // Only reached when an error handler lets execution continue:
        // fall back to the definition of the current parent name.
        $this->info_parent_def = $this->manager->getElement($this->info_parent, true);
    }

    // support template text
    $support = "(for information on implementing this, see the ".
               "support forums) ";

    // setup allowed elements -----------------------------------------

    $allowed_elements = $config->get('HTML', 'AllowedElements');
    $allowed_attributes = $config->get('HTML', 'AllowedAttributes'); // retrieve early

    // HTML.Allowed is only consulted when neither explicit list is set.
    if (!is_array($allowed_elements) && !is_array($allowed_attributes)) {
        $allowed = $config->get('HTML', 'Allowed');
        if (is_string($allowed)) {
            list($allowed_elements, $allowed_attributes) = $this->parseTinyMCEAllowedList($allowed);
        }
    }

    if (is_array($allowed_elements)) {
        foreach ($this->info as $name => $d) {
            if (!isset($allowed_elements[$name])) {
                unset($this->info[$name]);
            }
            // Whatever is left in $allowed_elements afterwards was
            // requested but never matched a known element.
            unset($allowed_elements[$name]);
        }
        // emit errors
        foreach ($allowed_elements as $element => $d) {
            $element = htmlspecialchars($element); // PHP doesn't escape errors, be careful!
            trigger_error("Element '$element' is not supported $support", E_USER_WARNING);
        }
    }

    // setup allowed attributes ---------------------------------------

    // Working copy: matched keys are removed from it so that the
    // leftovers can be reported as unsupported.
    $allowed_attributes_mutable = $allowed_attributes; // by copy!
    if (is_array($allowed_attributes)) {

        // This actually doesn't do anything, since we went away from
        // global attributes. It's possible that userland code uses
        // it, but HTMLModuleManager doesn't!
        foreach ($this->info_global_attr as $attr => $x) {
            $keys = array($attr, "*@$attr", "*.$attr");
            $delete = true;
            foreach ($keys as $key) {
                if ($delete && isset($allowed_attributes[$key])) {
                    $delete = false;
                }
                if (isset($allowed_attributes_mutable[$key])) {
                    unset($allowed_attributes_mutable[$key]);
                }
            }
            if ($delete) {
                unset($this->info_global_attr[$attr]);
            }
        }

        foreach ($this->info as $tag => $info) {
            foreach ($info->attr as $attr => $x) {
                // Every spelling under which this attribute may have
                // been allowed.
                $keys = array("$tag@$attr", $attr, "*@$attr", "$tag.$attr", "*.$attr");
                $delete = true;
                foreach ($keys as $key) {
                    if ($delete && isset($allowed_attributes[$key])) {
                        $delete = false;
                    }
                    if (isset($allowed_attributes_mutable[$key])) {
                        unset($allowed_attributes_mutable[$key]);
                    }
                }
                if ($delete) {
                    unset($this->info[$tag]->attr[$attr]);
                }
            }
        }

        // emit errors
        foreach ($allowed_attributes_mutable as $elattr => $d) {
            $bits = preg_split('/[.@]/', $elattr, 2);
            $c = count($bits);
            switch ($c) {
                case 2:
                    if ($bits[0] !== '*') {
                        $element = htmlspecialchars($bits[0]);
                        $attribute = htmlspecialchars($bits[1]);
                        // Look the element up by its raw name; the
                        // escaped form is for the message only.
                        if (!isset($this->info[$bits[0]])) {
                            // E_USER_NOTICE is trigger_error()'s default
                            // level; spelled out to make the difference
                            // from the neighbouring warnings visible.
                            trigger_error("Cannot allow attribute '$attribute' if element '$element' is not allowed/supported $support",
                                E_USER_NOTICE);
                        } else {
                            trigger_error("Attribute '$attribute' in element '$element' not supported $support",
                                E_USER_WARNING);
                        }
                        break;
                    }
                    // otherwise fall through
                case 1:
                    // The attribute name is the last bit: for a bare
                    // "attr" that is $bits[0], for "*.attr" / "*@attr"
                    // it is $bits[1] ($bits[0] is just the wildcard).
                    $attribute = htmlspecialchars($bits[$c - 1]);
                    trigger_error("Global attribute '$attribute' is not ".
                        "supported in any elements $support",
                        E_USER_WARNING);
                    break;
            }
        }

    }

    // setup forbidden elements ---------------------------------------

    $forbidden_elements   = $config->get('HTML', 'ForbiddenElements');
    $forbidden_attributes = $config->get('HTML', 'ForbiddenAttributes');

    foreach ($this->info as $tag => $info) {
        if (isset($forbidden_elements[$tag])) {
            unset($this->info[$tag]);
            continue;
        }
        foreach ($info->attr as $attr => $x) {
            if (
                isset($forbidden_attributes["$tag@$attr"]) ||
                isset($forbidden_attributes["*@$attr"]) ||
                isset($forbidden_attributes[$attr])
            ) {
                unset($this->info[$tag]->attr[$attr]);
                continue;
            } // this segment might get removed eventually
            elseif (isset($forbidden_attributes["$tag.$attr"])) {
                // $tag.$attr are not user supplied, so no worries!
                trigger_error("Error with $tag.$attr: tag.attr syntax not supported for HTML.ForbiddenAttributes; use tag@attr instead", E_USER_WARNING);
            }
        }
    }

    // Warn about the unsupported "*.attr" spelling in the blacklist.
    foreach ($forbidden_attributes as $key => $v) {
        // Numeric-looking keys arrive as integers; index them as strings.
        $key = (string) $key;
        if (strlen($key) < 2) {
            continue;
        }
        if ($key[0] != '*') {
            continue;
        }
        if ($key[1] == '.') {
            trigger_error("Error with $key: *.attr syntax not supported for HTML.ForbiddenAttributes; use attr instead", E_USER_WARNING);
        }
    }
}
/**
 * Parses a TinyMCE-flavored Allowed Elements and Attributes list into
 * separate lists for processing. Format is element[attr1|attr2],element2...
 *
 * Spaces and tabs are ignored everywhere; entries are separated by
 * commas or line breaks. The element name '*' contributes attributes
 * only and is not added to the element list.
 *
 * @warning Although it's largely drawn from TinyMCE's implementation,
 *      it is different, and you'll probably have to modify your lists
 * @param $list String list to parse
 * @return array($allowed_elements, $allowed_attributes), both lookup
 *      arrays mapping name (respectively "element.attr") to true
 * @todo Give this its own class, probably static interface
 */
public function parseTinyMCEAllowedList($list)
{
    $list = str_replace(array(' ', "\t"), '', $list);

    $elements = array();
    $attributes = array();

    $chunks = preg_split('/(,|[\n\r]+)/', $list);
    foreach ($chunks as $chunk) {
        if (empty($chunk)) {
            continue;
        }

        // A chunk without '[' is a bare element name. A chunk that
        // *starts* with '[' has no element name in front of it and is
        // likewise kept verbatim, so that it is reported as an
        // unsupported element later on.
        $bracket = strpos($chunk, '[');
        if ($bracket === false || $bracket === 0) {
            $element = $chunk;
            $attr = false;
        } else {
            list($element, $attr) = explode('[', $chunk);
        }

        if ($element !== '*') {
            $elements[$element] = true;
        }
        if (!$attr) {
            continue;
        }

        // Remove the closing ']' only when it is actually there, so a
        // missing bracket does not eat the last letter of an attribute.
        if (substr($attr, -1) === ']') {
            $attr = substr($attr, 0, strlen($attr) - 1);
        }

        foreach (explode('|', $attr) as $key) {
            // "p[]" or a stray '|' would otherwise register an
            // attribute with an empty name.
            if ((string) $key === '') {
                continue;
            }
            $attributes["$element.$key"] = true;
        }
    }

    return array($elements, $attributes);
}