Release 1.6.1, merged in 931 to HEAD.
[htmlpurifier.git] / library / HTMLPurifier / HTMLModuleManager.php
blob81ef13a5f4263184f4fb725f35de9523d3a290f7
1 <?php
3 require_once 'HTMLPurifier/HTMLModule.php';
4 require_once 'HTMLPurifier/ElementDef.php';
6 require_once 'HTMLPurifier/ContentSets.php';
7 require_once 'HTMLPurifier/AttrTypes.php';
8 require_once 'HTMLPurifier/AttrCollections.php';
10 require_once 'HTMLPurifier/AttrDef.php';
11 require_once 'HTMLPurifier/AttrDef/Enum.php';
13 // W3C modules
14 require_once 'HTMLPurifier/HTMLModule/CommonAttributes.php';
15 require_once 'HTMLPurifier/HTMLModule/Text.php';
16 require_once 'HTMLPurifier/HTMLModule/Hypertext.php';
17 require_once 'HTMLPurifier/HTMLModule/List.php';
18 require_once 'HTMLPurifier/HTMLModule/Presentation.php';
19 require_once 'HTMLPurifier/HTMLModule/Edit.php';
20 require_once 'HTMLPurifier/HTMLModule/Bdo.php';
21 require_once 'HTMLPurifier/HTMLModule/Tables.php';
22 require_once 'HTMLPurifier/HTMLModule/Image.php';
23 require_once 'HTMLPurifier/HTMLModule/StyleAttribute.php';
24 require_once 'HTMLPurifier/HTMLModule/Legacy.php';
25 require_once 'HTMLPurifier/HTMLModule/Target.php';
27 // proprietary modules
28 require_once 'HTMLPurifier/HTMLModule/TransformToStrict.php';
29 require_once 'HTMLPurifier/HTMLModule/TransformToXHTML11.php';
// Declares the HTML.Doctype configuration directive that getDoctype()
// reads with $config->get('HTML', 'Doctype'). The third argument is null
// (presumably the default value -- confirm against ConfigSchema::define);
// getDoctype() treats a null value as "not set" and falls back to the
// older %Core.XHTML / %HTML.Strict directives.
HTMLPurifier_ConfigSchema::define(
    'HTML', 'Doctype', null, 'string/null',
    'Doctype to use, valid values are HTML 4.01 Transitional, HTML 4.01 '.
    'Strict, XHTML 1.0 Transitional, XHTML 1.0 Strict, XHTML 1.1. '.
    'Technically speaking this is not actually a doctype (as it does '.
    'not identify a corresponding DTD), but we are using this name '.
    'for sake of simplicity. This will override any older directives '.
    'like %Core.XHTML or %HTML.Strict.'
);
41 class HTMLPurifier_HTMLModuleManager
44 /**
45 * Array of HTMLPurifier_HTMLModule instances, indexed by the module's name.
46 * All known modules, regardless of use, are in this array.
48 var $modules = array();
50 /**
51 * String doctype we will validate against. See $validModules for use.
53 * @note
54 * There is a special doctype '*' that acts both as the "default"
55 * doctype if a customized system only defines one doctype and
56 * also a catch-all doctype that gets merged into all the other
57 * module collections. When possible, use a private collection to
58 * share modules between doctypes: this special doctype is to
59 * make life more convenient for users.
61 var $doctype;
62 var $doctypeAliases = array(); /**< Lookup array of strings to real doctypes */
64 /**
65 * Associative array: $collections[$type][$doctype] = list of modules.
66 * This is used to logically separate types of functionality so that
67 * based on the doctype and other configuration settings they may
68 * be easily switched on and off. Custom setups may not need
69 * to use this abstraction, opting to have only one big collection
70 * with one valid doctype.
72 var $collections = array();
74 /**
75 * Modules that may be used in a valid doctype of this kind.
76 * Correctional and leniency modules should not be placed in this
77 * array unless the user said so: don't stuff every possible lenient
78 * module for this doctype in here.
80 var $validModules = array();
81 var $validCollections = array(); /**< Collections to merge into $validModules */
83 /**
84 * Modules that we will allow in input, subset of $validModules. Single
85 * element definitions may result in us consulting validModules.
87 var $activeModules = array();
88 var $activeCollections = array(); /**< Collections to merge into $activeModules */
90 var $counter = 0; /**< Designates next available integer order for modules. */
91 var $initialized = false; /**< Says whether initialize() was called */
93 /**
94 * Specifies what doctype to siphon new modules from addModule() to,
95 * or false to disable the functionality. Must be used in conjunction
96 * with $autoCollection.
98 var $autoDoctype = false;
99 /**
100 * Specifies what collection to siphon new modules from addModule() to,
101 * or false to disable the functionality. Must be used in conjunction
102 * with $autoDoctype.
104 var $autoCollection = false;
106 /** Associative array of element name to defining modules (always array) */
107 var $elementLookup = array();
109 /** List of prefixes we should use for resolving small names */
110 var $prefixes = array('HTMLPurifier_HTMLModule_');
112 var $contentSets; /**< Instance of HTMLPurifier_ContentSets */
113 var $attrTypes; /**< Instance of HTMLPurifier_AttrTypes */
114 var $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */
/**
 * PHP 4-style constructor (a method named after the class). Creates the
 * attribute type registry and, unless told otherwise, loads the default
 * modules and collections through initialize().
 * @note PHP 8 no longer treats a method named after its class as a
 *       constructor; the PHP 4-compatible form is kept because this
 *       file still supports PHP 4 (see _classExists()).
 * @param $blank If true, don't do any initializing
 */
function HTMLPurifier_HTMLModuleManager($blank = false) {

    // the only editable internal object. The rest need to
    // be manipulated through modules
    $this->attrTypes = new HTMLPurifier_AttrTypes();

    if (!$blank) $this->initialize();

}
/**
 * Loads the default configuration: registers the bundled modules,
 * defines the Safe, Unsafe, Lenient, Correctional and Extension
 * collections, marks which of them are valid/active, and finally
 * switches on auto-collection so that modules registered afterwards
 * through addModule() are filed under the Extension collection for
 * the catch-all doctype '*'.
 * @note Inside a collection, a nested array at index 0 is an inclusion
 *       list naming other doctype entries of the same collection; it is
 *       expanded later by processCollections().
 */
function initialize() {
    $this->initialized = true;

    // load default modules to the recognized modules list (not active).
    // On a fresh instance $autoDoctype/$autoCollection are still false
    // here, so addModule() does not file these into any collection.
    $modules = array(
        // define
        'CommonAttributes',
        'Text', 'Hypertext', 'List', 'Presentation',
        'Edit', 'Bdo', 'Tables', 'Image', 'StyleAttribute',
        'Target',
        // define-redefine
        'Legacy',
        // redefine
        'TransformToStrict', 'TransformToXHTML11'
    );
    foreach ($modules as $module) {
        $this->addModule($module);
    }

    // Safe modules for supported doctypes. These are included
    // in the valid and active module lists by default
    $this->collections['Safe'] = array(
        '_Common' => array( // leading _ indicates private
            'CommonAttributes', 'Text', 'Hypertext', 'List',
            'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
            'StyleAttribute'
        ),
        // HTML definitions, defer to XHTML definitions
        'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')),
        'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')),
        // XHTML definitions
        'XHTML 1.0 Transitional' => array( array('XHTML 1.0 Strict'), 'Legacy', 'Target' ),
        'XHTML 1.0 Strict' => array(array('_Common')),
        'XHTML 1.1' => array(array('_Common')),
    );

    // Modules that specify elements that are unsafe from untrusted
    // third-parties. These should be registered in $validModules but
    // almost never $activeModules unless you really know what you're
    // doing.
    $this->collections['Unsafe'] = array();

    // Modules to import if lenient mode (attempt to convert everything
    // to a valid representation) is on. These must not be in $validModules
    // unless specified so.
    $this->collections['Lenient'] = array(
        'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')),
        'XHTML 1.0 Strict' => array('TransformToStrict'),
        'XHTML 1.1' => array(array('XHTML 1.0 Strict'), 'TransformToXHTML11')
    );

    // Modules to import if correctional mode (correct everything that
    // is feasible to strict mode) is on. These must not be in $validModules
    // unless specified so.
    $this->collections['Correctional'] = array(
        'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')),
        'XHTML 1.0 Transitional' => array('TransformToStrict'), // probably want a different one
    );

    // User-space modules, custom code or whatever
    $this->collections['Extension'] = array();

    // setup active versus valid modules. ORDER IS IMPORTANT!
    // definition modules
    $this->makeCollectionActive('Safe');
    $this->makeCollectionValid('Unsafe');
    // redefinition modules
    $this->makeCollectionActive('Lenient');
    $this->makeCollectionActive('Correctional');

    // from now on, addModule() files new modules under Extension / '*'
    $this->autoDoctype    = '*';
    $this->autoCollection = 'Extension';

}
/**
 * Adds a module to the recognized module list. Unless auto-collection
 * is enabled (both $autoDoctype and $autoCollection are not false),
 * this does not do anything else: the module must be added to a
 * corresponding collection to be "activated".
 * @param $module Mixed: string module name, with or without
 *        HTMLPurifier_HTMLModule prefix, or instance of
 *        subclass of HTMLPurifier_HTMLModule.
 * @note This function will not call autoload, you must instantiate
 *       (and thus invoke) autoload outside the method.
 * @note If a string is passed as a module name, different variants
 *       will be tested in this order:
 *          - Check all prefixes with $name in order they were added
 *            (the first prefix is HTMLPurifier_HTMLModule_)
 *          - Check for literal object name
 *          - Throw fatal error
 *       If your object name collides with an internal class, specify
 *       your module manually.
 */
function addModule($module) {
    if (is_string($module)) {
        $original_module = $module;
        $ok = false;
        // try each registered prefix until a loaded class matches
        foreach ($this->prefixes as $prefix) {
            $module = $prefix . $original_module;
            if ($this->_classExists($module)) {
                $ok = true;
                break;
            }
        }
        if (!$ok) {
            // fall back to treating the string as a complete class name
            $module = $original_module;
            if (!$this->_classExists($module)) {
                trigger_error($original_module . ' module does not exist',
                    E_USER_ERROR);
                return;
            }
        }
        $module = new $module();
    }
    $module->order = $this->counter++; // assign then increment
    $this->modules[$module->name] = $module;
    // siphon the module into the auto-collection, if one is configured
    if ($this->autoDoctype !== false && $this->autoCollection !== false) {
        $this->collections[$this->autoCollection][$this->autoDoctype][] = $module->name;
    }
}
/**
 * Safely tests for class existence without invoking __autoload in PHP5
 * @param $name String class name to test
 * @return Boolean, true if the class is already declared
 * @private
 */
function _classExists($name) {
    // the version check is done once and remembered across calls
    static $is_php_4 = null;
    if ($is_php_4 === null) {
        $is_php_4 = version_compare(PHP_VERSION, '5', '<');
    }
    if ($is_php_4) {
        // PHP 4 has no autoloading and no second parameter
        return class_exists($name);
    } else {
        // false: do not trigger the autoloader
        return class_exists($name, false);
    }
}
/**
 * Makes a collection active, while also making it valid if not
 * already done so. See $activeModules for the semantics of "active".
 * @note The name is appended to $activeCollections on every call, even
 *       if it is already listed there.
 * @param $collection_name Name of collection to activate
 */
function makeCollectionActive($collection_name) {
    if (!in_array($collection_name, $this->validCollections)) {
        $this->makeCollectionValid($collection_name);
    }
    $this->activeCollections[] = $collection_name;
}
/**
 * Makes a collection valid. See $validModules for the semantics of "valid"
 * @note The name is appended to $validCollections without a duplicate
 *       check; makeCollectionActive() performs that check before calling.
 * @param $collection_name Name of collection to make valid
 */
function makeCollectionValid($collection_name) {
    $this->validCollections[] = $collection_name;
}
/**
 * Adds a class prefix that addModule() will use to resolve a
 * string name to a concrete class
 * @param $prefix Prefix to register; it is cast to string and tried
 *        after all previously registered prefixes
 */
function addPrefix($prefix) {
    $this->prefixes[] = (string) $prefix;
}
/**
 * Resolves the doctype and turns the declared collections into the
 * working state: $validModules, $activeModules, $elementLookup,
 * $contentSets and $attrCollections.
 * @note processCollections() replaces module names with module
 *       instances in place, so this method is written to run once
 *       per manager.
 * @param $config Configuration object, passed on to getDoctype()
 */
function setup($config) {

    // load up the autocollection
    if ($this->autoCollection !== false) {
        $this->makeCollectionActive($this->autoCollection);
    }

    // retrieve the doctype
    $this->doctype = $this->getDoctype($config);
    if (isset($this->doctypeAliases[$this->doctype])) {
        $this->doctype = $this->doctypeAliases[$this->doctype];
    }

    // process module collections to module name => module instance form
    foreach ($this->collections as $col_i => $x) {
        // index into the member so the collection is modified in place
        $this->processCollections($this->collections[$col_i]);
    }

    $this->validModules  = $this->assembleModules($this->validCollections);
    $this->activeModules = $this->assembleModules($this->activeCollections);

    // setup lookup table based on all valid modules
    foreach ($this->validModules as $module) {
        foreach ($module->info as $name => $def) {
            if (!isset($this->elementLookup[$name])) {
                $this->elementLookup[$name] = array();
            }
            $this->elementLookup[$name][] = $module->name;
        }
    }

    // note the different choice
    $this->contentSets = new HTMLPurifier_ContentSets(
        // content models that contain non-allowed elements are
        // harmless because RemoveForeignElements will ensure
        // they never get in anyway, and there is usually no
        // reason why you should want to restrict a content
        // model beyond what is mandated by the doctype.
        // Note, however, that this means redefinitions of
        // content models can't be tossed in validModules willy-nilly:
        // that stuff still is regulated by configuration.
        $this->validModules
    );
    $this->attrCollections = new HTMLPurifier_AttrCollections(
        $this->attrTypes,
        // only explicitly allowed modules are allowed to affect
        // the global attribute collections. This means there's
        // a distinction between loading the Bdo module, and the
        // bdo element: Bdo will enable the dir attribute on all
        // elements, while bdo will only define the bdo element,
        // which will not have an editable directionality. This might
        // catch people who are loading only elements by surprise, so
        // we should consider loading an entire module if all the
        // elements it defines are requested by the user, especially
        // if it affects the global attribute collections.
        $this->activeModules
    );

}
/**
 * Takes a list of collections and merges together all the defined
 * modules for the current doctype from those collections.
 * @note Merging uses the array union operator, so when the same module
 *       name occurs in several collections the first occurrence wins.
 * @note Only entries for the current doctype count as "used"; modules
 *       taken from the catch-all doctype '*' alone do not prevent the
 *       "doctype does not exist" error.
 * @param $collections List of collection suffixes we should grab
 *        modules from (like 'Safe' or 'Lenient')
 * @return Associative array of module name to module instance
 */
function assembleModules($collections) {
    $modules = array();
    $numOfCollectionsUsed = 0;
    foreach ($collections as $name) {
        $disable_global = false;
        if (!isset($this->collections[$name])) {
            trigger_error("$name collection is undefined", E_USER_ERROR);
            continue;
        }
        $cols = $this->collections[$name];
        if (isset($cols[$this->doctype])) {
            // a '*' key inside the doctype's own list is the marker
            // left by processCollections() meaning "skip the catch-all";
            // $cols is a local copy, so removing it is not permanent
            if (isset($cols[$this->doctype]['*'])) {
                unset($cols[$this->doctype]['*']);
                $disable_global = true;
            }
            $modules += $cols[$this->doctype];
            $numOfCollectionsUsed++;
        }
        // accept catch-all doctype
        if (
            $this->doctype !== '*' &&
            isset($cols['*']) &&
            !$disable_global
        ) {
            $modules += $cols['*'];
        }
    }

    if ($numOfCollectionsUsed < 1) {
        // possible XSS injection if user-specified doctypes
        // are allowed
        trigger_error("Doctype {$this->doctype} does not exist, ".
            "check for typos (if you desire a doctype that allows ".
            "no elements, use an empty array collection)", E_USER_ERROR);
    }
    return $modules;
}
/**
 * Takes a collection and performs inclusions and substitutions for it.
 * Works in three passes over the doctype entries of the collection:
 *  1. expands the inclusion array (index 0) of every entry, following
 *     nested inclusions, and appends the included module names;
 *  2. replaces module names by the module instances registered in
 *     $this->modules, keyed by module name and sorted by module order;
 *  3. removes private entries, i.e. those whose key starts with '_'.
 * @note An entry that is reached more than once through different
 *       parents is merged only once and is not an error; only an
 *       inclusion path that leads back to the entry being expanded is
 *       reported as circular.
 * @param $cols Reference to collections class member variable
 */
function processCollections(&$cols) {

    // $cols is the set of collections
    // $col_i is the name (index) of a collection
    // $col is a collection/list of modules

    // perform inclusions
    foreach ($cols as $col_i => $col) {
        $seen = array();
        if (!empty($col[0]) && is_array($col[0])) {
            $seen[$col_i] = true; // recursion reporting
            $includes = $col[0];
            unset($cols[$col_i][0]); // remove inclusions value, recursion guard
        } else {
            $includes = array();
        }
        if (empty($includes)) continue;
        // $includes grows while we walk it: nested inclusions found in
        // an included entry are appended and visited later
        for ($i = 0; isset($includes[$i]); $i++) {
            $inc = $includes[$i];
            if (isset($seen[$inc])) {
                // already merged during this expansion. That is a real
                // cycle only when the path leads back to the entry we
                // are expanding; otherwise it was merely reachable
                // through two parents and can be skipped silently.
                if ((string) $inc === (string) $col_i) {
                    trigger_error(
                        "Circular inclusion detected in $col_i collection",
                        E_USER_ERROR
                    );
                }
                continue;
            }
            $seen[$inc] = true;
            if (!isset($cols[$inc])) {
                trigger_error(
                    "Collection $col_i tried to include undefined ".
                    "collection $inc", E_USER_ERROR);
                continue;
            }
            foreach ($cols[$inc] as $module) {
                if (is_array($module)) { // another inclusion!
                    foreach ($module as $inc2) $includes[] = $inc2;
                    continue;
                }
                $cols[$col_i][] = $module; // merge in the other modules
            }
        }
    }

    // replace with real modules, invert module from list to
    // assoc array of module name to module instance
    foreach ($cols as $col_i => $col) {
        $ignore_global = false;
        $order = array();
        foreach ($col as $module_i => $module) {
            // every original entry is removed; valid ones are re-added
            // below under their module name
            unset($cols[$col_i][$module_i]);
            if (is_array($module)) {
                trigger_error("Illegal inclusion array at index".
                    " $module_i found collection $col_i, inclusion".
                    " arrays must be at start of collection (index 0)",
                    E_USER_ERROR);
                continue;
            }
            // '*' => false asks assembleModules() to skip the catch-all
            if ($module_i === '*' && $module === false) {
                $ignore_global = true;
                continue;
            }
            if (!isset($this->modules[$module])) {
                trigger_error(
                    "Collection $col_i references undefined ".
                    "module $module",
                    E_USER_ERROR
                );
                continue;
            }
            $module = $this->modules[$module];
            $cols[$col_i][$module->name] = $module;
            $order[$module->name] = $module->order;
        }
        // sort the modules by the order in which they were registered
        array_multisort(
            $order, SORT_ASC, SORT_NUMERIC, $cols[$col_i]
        );
        // re-attach the marker after sorting so it is not part of the sort
        if ($ignore_global) $cols[$col_i]['*'] = false;
    }

    // delete pseudo-collections
    foreach ($cols as $col_i => $col) {
        // strncmp on the string form also copes with integer or empty
        // keys, for which $col_i[0] would raise a notice
        if (strncmp((string) $col_i, '_', 1) === 0) unset($cols[$col_i]);
    }

}
/**
 * Retrieves the doctype from the configuration object
 * @note Resolution order: an explicit HTML.Doctype value; then, for a
 *       manager that never ran initialize(), the auto-doctype or '*';
 *       otherwise a name built from the older Core.XHTML and
 *       HTML.Strict directives.
 * @param $config Configuration object providing get($namespace, $name)
 * @return String doctype name (aliases are not resolved here)
 */
function getDoctype($config) {
    $doctype = $config->get('HTML', 'Doctype');
    if ($doctype !== null) {
        return $doctype;
    }
    if (!$this->initialized) {
        // don't do HTML-oriented backwards compatibility stuff
        // use either the auto-doctype, or the catch-all doctype
        return $this->autoDoctype ? $this->autoDoctype : '*';
    }
    // this is backwards-compatibility stuff
    if ($config->get('Core', 'XHTML')) {
        $doctype = 'XHTML 1.0';
    } else {
        $doctype = 'HTML 4.01';
    }
    if ($config->get('HTML', 'Strict')) {
        $doctype .= ' Strict';
    } else {
        $doctype .= ' Transitional';
    }
    return $doctype;
}
/**
 * Retrieves merged element definitions for all active elements.
 * @note We may want to generate an elements array during setup
 *       and pass that on, because a specific combination of
 *       elements may trigger the loading of a module.
 * @param $config Instance of HTMLPurifier_Config, for determining
 *        stray elements.
 * @return Associative array of element name to the value returned by
 *         getElement() for that name (false if no definition could
 *         be built)
 */
function getElements($config) {

    $elements = array();
    foreach ($this->activeModules as $module) {
        foreach ($module->info as $name => $v) {
            // each element is resolved once, however many active
            // modules mention it
            if (isset($elements[$name])) continue;
            $elements[$name] = $this->getElement($name, $config);
        }
    }

    // standalone elements now loaded

    return $elements;

}
/**
 * Retrieves a single merged element definition
 * @note Definitions are taken from all valid modules that mention the
 *       element, in the order recorded in $elementLookup. The first
 *       standalone definition becomes the base; later definitions are
 *       merged into it. Non-standalone definitions met before any
 *       standalone one are skipped.
 * @note NOTE(review): on PHP 5 and later objects are assigned by
 *       handle, so the base definition is the module's own object and
 *       mergeIn() and the attribute expansion below modify it in
 *       place. On PHP 4 the assignment copies. Confirm whether a copy
 *       should be made here.
 * @param $name Name of element
 * @param $config Instance of HTMLPurifier_Config, may not be necessary.
 * @return Merged element definition, or false if the element is
 *         unknown or has no standalone definition
 */
function getElement($name, $config) {

    $def = false;

    $modules = $this->validModules;

    if (!isset($this->elementLookup[$name])) {
        return false;
    }

    foreach($this->elementLookup[$name] as $module_name) {

        $module  = $modules[$module_name];
        $new_def = $module->info[$name];

        if (!$def && $new_def->standalone) {
            $def = $new_def;
        } elseif ($def) {
            $def->mergeIn($new_def);
        } else {
            // could "save it for another day":
            // non-standalone definitions that don't have a standalone
            // to merge into could be deferred to the end
            continue;
        }

        // attribute value expansions
        $this->attrCollections->performInclusions($def->attr);
        $this->attrCollections->expandIdentifiers($def->attr, $this->attrTypes);

        // descendants_are_inline, for ChildDef_Chameleon
        if (is_string($def->content_model) &&
            strpos($def->content_model, 'Inline') !== false) {
            if ($name != 'del' && $name != 'ins') {
                // this is for you, ins/del
                $def->descendants_are_inline = true;
            }
        }

        // runs once per contributing module, inside the loop
        $this->contentSets->generateChildDef($def, $module);
    }

    return $def;

}