Add more documentation to HTMLDefinition in anticipation of refactoring.
[htmlpurifier.git] / library / HTMLPurifier / HTMLDefinition.php
blob9ef7d1c16cd8daf29e3b7515ffaf2665458e7b92
1 <?php
3 require_once 'HTMLPurifier/AttrDef.php';
4 require_once 'HTMLPurifier/AttrDef/Enum.php';
5 require_once 'HTMLPurifier/AttrDef/ID.php';
6 require_once 'HTMLPurifier/AttrDef/Class.php';
7 require_once 'HTMLPurifier/AttrDef/Text.php';
8 require_once 'HTMLPurifier/AttrDef/Lang.php';
9 require_once 'HTMLPurifier/AttrDef/Pixels.php';
10 require_once 'HTMLPurifier/AttrDef/Length.php';
11 require_once 'HTMLPurifier/AttrDef/MultiLength.php';
12 require_once 'HTMLPurifier/AttrDef/Integer.php';
13 require_once 'HTMLPurifier/AttrDef/URI.php';
14 require_once 'HTMLPurifier/AttrDef/CSS.php';
15 require_once 'HTMLPurifier/AttrTransform.php';
16 require_once 'HTMLPurifier/AttrTransform/Lang.php';
17 require_once 'HTMLPurifier/AttrTransform/TextAlign.php';
18 require_once 'HTMLPurifier/AttrTransform/BdoDir.php';
19 require_once 'HTMLPurifier/AttrTransform/ImgRequired.php';
20 require_once 'HTMLPurifier/ChildDef.php';
21 require_once 'HTMLPurifier/Generator.php';
22 require_once 'HTMLPurifier/Token.php';
23 require_once 'HTMLPurifier/TagTransform.php';
25 /**
26 * Defines the purified HTML type with a large number of objects.
28 * The main function of this object is its $info array, which is an
29 * associative array of all the child and attribute definitions for
30 * each allowed element. It also contains special use information (always
31 * prefixed by info) for intelligent tag closing and global attributes.
33 * For optimization, the definition generation may be moved to
34 * a maintenance script and stipulate that definition be created
35 * by a factory method that unserializes a serialized version of Definition.
36 * Customization would entail copying the maintenance script, making the
37 * necessary changes, generating the serialized object, and then hooking it
38 * in via the factory method. We would also offer a LiveDefinition for
39 * automatic recompilation, suggesting that we would have a DefinitionGenerator.
42 class HTMLPurifier_HTMLDefinition
45 /**
46 * Associative array of element names to HTMLPurifier_ElementDef
47 * @public
49 var $info = array();
51 /**
52 * Associative array of global attribute name to attribute definition.
53 * @public
55 var $info_global_attr = array();
57 /**
58 * String name of parent element HTML will be going into.
59 * @todo Allow this to be overloaded by user config
60 * @public
62 var $info_parent = 'div';
64 /**
65 * Associative array of deprecated tag name to HTMLPurifier_TagTransform
66 * @public
68 var $info_tag_transform = array();
70 /**
71 * List of HTMLPurifier_AttrTransform to be performed before validation.
72 * @public
74 var $info_attr_transform_pre = array();
76 /**
77 * List of HTMLPurifier_AttrTransform to be performed after validation.
78 * @public
80 var $info_attr_transform_post = array();
82 /**
83 * Initializes the definition, the meat of the class.
85 function setup($config) {
87 // emulates the structure of the DTD
88 // these are condensed, however, with bad stuff taken out
89 // screening process was done by hand
91 //////////////////////////////////////////////////////////////////////
92 // info[] : initializes the definition objects
94 // if you attempt to define rules later on for a tag not in this array
95 // PHP will create a stdClass
97 $allowed_tags =
98 array(
99 'ins', 'del', 'blockquote', 'dd', 'li', 'div', 'em', 'strong',
100 'dfn', 'code', 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym',
101 'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small', 'u', 's',
102 'strike', 'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4',
103 'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr',
104 'pre', 'a', 'table', 'caption', 'thead', 'tfoot', 'tbody',
105 'colgroup', 'col', 'td', 'th', 'tr'
108 foreach ($allowed_tags as $tag) {
109 $this->info[$tag] = new HTMLPurifier_ElementDef();
112 //////////////////////////////////////////////////////////////////////
113 // info[]->child : defines allowed children for elements
115 // entities: prefixed with e_ and _ replaces . from DTD
116 // double underlines are entities we made up
118 // we don't use an array because that complicates interpolation
119 // strings are used instead of arrays because if you use arrays,
120 // you have to do some hideous manipulation with array_merge()
122 // todo: determine whether or not having allowed children
123 // that aren't allowed globally affects security (it shouldn't)
124 // if above works out, extend children definitions to include all
125 // possible elements (allowed elements will dictate which ones
126 // get dropped)
128 $e_special_extra = 'img';
129 $e_special_basic = 'br | span | bdo';
130 $e_special = "$e_special_basic | $e_special_extra";
131 $e_fontstyle_extra = 'big | small';
132 $e_fontstyle_basic = 'tt | i | b | u | s | strike';
133 $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra";
134 $e_phrase_extra = 'sub | sup';
135 $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
136 ' | cite | abbr | acronym';
137 $e_phrase = "$e_phrase_basic | $e_phrase_extra";
138 $e_inline_forms = ''; // humor the dtd
139 $e_misc_inline = 'ins | del';
140 $e_misc = "$e_misc_inline";
141 $e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
142 " | $e_inline_forms";
143 // pseudo-property we created for convenience, see later on
144 $e__inline = "#PCDATA | $e_inline | $e_misc_inline";
145 // note the casing
146 $e_Inline = new HTMLPurifier_ChildDef_Optional($e__inline);
147 $e_heading = 'h1|h2|h3|h4|h5|h6';
148 $e_lists = 'ul | ol | dl';
149 $e_blocktext = 'pre | hr | blockquote | address';
150 $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";
151 $e__flow = "#PCDATA | $e_block | $e_inline | $e_misc";
152 $e_Flow = new HTMLPurifier_ChildDef_Optional($e__flow);
153 $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA".
154 " | $e_special | $e_fontstyle | $e_phrase | $e_inline_forms".
155 " | $e_misc_inline");
156 $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a".
157 " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
158 " | $e_inline_forms | $e_misc_inline");
159 $e_form_content = new HTMLPurifier_ChildDef_Optional('');//unused
160 $e_form_button_content = new HTMLPurifier_ChildDef_Optional('');//unused
162 $this->info['ins']->child =
163 $this->info['del']->child =
164 new HTMLPurifier_ChildDef_Chameleon($e__inline, $e__flow);
166 $this->info['blockquote']->child=
167 $this->info['dd']->child =
168 $this->info['li']->child =
169 $this->info['div']->child = $e_Flow;
171 $this->info['caption']->child =
172 $this->info['em']->child =
173 $this->info['strong']->child =
174 $this->info['dfn']->child =
175 $this->info['code']->child =
176 $this->info['samp']->child =
177 $this->info['kbd']->child =
178 $this->info['var']->child =
179 $this->info['cite']->child =
180 $this->info['abbr']->child =
181 $this->info['acronym']->child =
182 $this->info['q']->child =
183 $this->info['sub']->child =
184 $this->info['tt']->child =
185 $this->info['sup']->child =
186 $this->info['i']->child =
187 $this->info['b']->child =
188 $this->info['big']->child =
189 $this->info['small']->child=
190 $this->info['u']->child =
191 $this->info['s']->child =
192 $this->info['strike']->child =
193 $this->info['bdo']->child =
194 $this->info['span']->child =
195 $this->info['dt']->child =
196 $this->info['p']->child =
197 $this->info['h1']->child =
198 $this->info['h2']->child =
199 $this->info['h3']->child =
200 $this->info['h4']->child =
201 $this->info['h5']->child =
202 $this->info['h6']->child = $e_Inline;
204 // the only three required definitions, besides custom table code
205 $this->info['ol']->child =
206 $this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li');
208 $this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd');
210 $this->info['address']->child =
211 new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
212 " | $e_misc_inline");
214 $this->info['img']->child =
215 $this->info['br']->child =
216 $this->info['hr']->child = new HTMLPurifier_ChildDef_Empty();
218 $this->info['pre']->child = $e_pre_content;
220 $this->info['a']->child = $e_a_content;
222 $this->info['table']->child = new HTMLPurifier_ChildDef_Table();
224 // not a real entity, watch the double underscore
225 $e__row = new HTMLPurifier_ChildDef_Required('tr');
226 $this->info['thead']->child = $e__row;
227 $this->info['tfoot']->child = $e__row;
228 $this->info['tbody']->child = $e__row;
229 $this->info['colgroup']->child = new HTMLPurifier_ChildDef_Optional('col');
230 $this->info['col']->child = new HTMLPurifier_ChildDef_Empty();
231 $this->info['tr']->child = new HTMLPurifier_ChildDef_Required('th | td');
232 $this->info['th']->child = $e_Flow;
233 $this->info['td']->child = $e_Flow;
235 //////////////////////////////////////////////////////////////////////
236 // info[]->type : defines the type of the element (block or inline)
238 // reuses $e_Inline; $e_Block is built below from $e_block
240 foreach ($e_Inline->elements as $name) {
241 $this->info[$name]->type = 'inline';
244 $e_Block = new HTMLPurifier_ChildDef_Optional($e_block);
245 foreach ($e_Block->elements as $name) {
246 $this->info[$name]->type = 'block';
249 //////////////////////////////////////////////////////////////////////
250 // info[]->excludes : defines elements that aren't allowed in here
252 // make sure you test using isset() and not !empty()
254 $this->info['a']->excludes = array('a' => true);
255 $this->info['pre']->excludes = array_flip(array('img', 'big', 'small',
256 // technically useless, but good to be thorough
257 'object', 'applet', 'font', 'basefont'));
259 //////////////////////////////////////////////////////////////////////
260 // info[]->attr : defines allowed attributes for elements
262 // this doesn't include REQUIRED declarations, those are handled
263 // by the transform classes. It will, however, do simple and slightly
264 // complex attribute value substitution
266 // the question of varying allowed attributes is more entangling.
268 $e_Text = new HTMLPurifier_AttrDef_Text();
270 // attrs, included in almost every single one except for a few,
271 // which manually override these in their local definitions
272 $this->info_global_attr = array(
273 // core attrs
274 'id' => new HTMLPurifier_AttrDef_ID(),
275 'class' => new HTMLPurifier_AttrDef_Class(),
276 'title' => $e_Text,
277 'style' => new HTMLPurifier_AttrDef_CSS(),
278 // i18n
279 'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false),
280 'lang' => new HTMLPurifier_AttrDef_Lang(),
281 'xml:lang' => new HTMLPurifier_AttrDef_Lang(),
284 // required attribute stipulation handled in attribute transformation
285 $this->info['bdo']->attr = array(); // nothing else
287 $this->info['br']->attr['dir'] = false;
288 $this->info['br']->attr['lang'] = false;
289 $this->info['br']->attr['xml:lang'] = false;
291 $this->info['td']->attr['abbr'] = $e_Text;
292 $this->info['th']->attr['abbr'] = $e_Text;
294 $this->setAttrForTableElements('align', new HTMLPurifier_AttrDef_Enum(
295 array('left', 'center', 'right', 'justify', 'char'), false));
297 $this->setAttrForTableElements('valign', new HTMLPurifier_AttrDef_Enum(
298 array('top', 'middle', 'bottom', 'baseline'), false));
300 $this->info['img']->attr['alt'] = $e_Text;
302 $e_TFrame = new HTMLPurifier_AttrDef_Enum(array('void', 'above',
303 'below', 'hsides', 'lhs', 'rhs', 'vsides', 'box', 'border'), false);
304 $this->info['table']->attr['frame'] = $e_TFrame;
306 $e_TRules = new HTMLPurifier_AttrDef_Enum(array('none', 'groups',
307 'rows', 'cols', 'all'), false);
308 $this->info['table']->attr['rules'] = $e_TRules;
310 $this->info['table']->attr['summary'] = $e_Text;
312 $this->info['table']->attr['border'] =
313 new HTMLPurifier_AttrDef_Pixels();
315 $e_Length = new HTMLPurifier_AttrDef_Length();
316 $this->info['table']->attr['cellpadding'] =
317 $this->info['table']->attr['cellspacing'] =
318 $this->info['table']->attr['width'] =
319 $this->info['img']->attr['height'] =
320 $this->info['img']->attr['width'] = $e_Length;
321 $this->setAttrForTableElements('charoff', $e_Length);
323 $e_MultiLength = new HTMLPurifier_AttrDef_MultiLength();
324 $this->info['col']->attr['width'] =
325 $this->info['colgroup']->attr['width'] = $e_MultiLength;
327 $e__NumberSpan = new HTMLPurifier_AttrDef_Integer(false, false, true);
328 $this->info['colgroup']->attr['span'] =
329 $this->info['col']->attr['span'] =
330 $this->info['td']->attr['rowspan'] =
331 $this->info['th']->attr['rowspan'] =
332 $this->info['td']->attr['colspan'] =
333 $this->info['th']->attr['colspan'] = $e__NumberSpan;
335 $e_URI = new HTMLPurifier_AttrDef_URI();
336 $this->info['a']->attr['href'] =
337 $this->info['img']->attr['longdesc'] =
338 $this->info['img']->attr['src'] =
339 $this->info['del']->attr['cite'] =
340 $this->info['ins']->attr['cite'] =
341 $this->info['blockquote']->attr['cite'] =
342 $this->info['q']->attr['cite'] = $e_URI;
344 //////////////////////////////////////////////////////////////////////
345 // info_tag_transform : transformations of tags
347 $this->info_tag_transform['font'] = new HTMLPurifier_TagTransform_Font();
348 $this->info_tag_transform['menu'] = new HTMLPurifier_TagTransform_Simple('ul');
349 $this->info_tag_transform['dir'] = new HTMLPurifier_TagTransform_Simple('ul');
350 $this->info_tag_transform['center'] = new HTMLPurifier_TagTransform_Center();
352 //////////////////////////////////////////////////////////////////////
353 // info[]->auto_close : tags that automatically close another
355 // todo: determine whether or not SGML-like modeling based on
356 // mandatory/optional end tags would be a better policy
358 // make sure you test using isset() not !empty()
360 // these are all block elements: blocks aren't allowed in P
361 $this->info['p']->auto_close = array_flip(array(
362 'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt',
363 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre',
364 'table', 'ul'
367 $this->info['li']->auto_close = array('li' => true);
369 // we need TABLE and heading mismatch code
370 // we may need to make this more flexible for heading mismatch,
371 // or we can just create another info
373 //////////////////////////////////////////////////////////////////////
374 // info[]->attr_transform_* : attribute transformations in elements
375 // pre is applied before any validation is done, post is done after
377 $this->info['h1']->attr_transform_pre[] =
378 $this->info['h2']->attr_transform_pre[] =
379 $this->info['h3']->attr_transform_pre[] =
380 $this->info['h4']->attr_transform_pre[] =
381 $this->info['h5']->attr_transform_pre[] =
382 $this->info['h6']->attr_transform_pre[] =
383 $this->info['p'] ->attr_transform_pre[] =
384 new HTMLPurifier_AttrTransform_TextAlign();
386 $this->info['bdo']->attr_transform_post[] =
387 new HTMLPurifier_AttrTransform_BdoDir();
389 $this->info['img']->attr_transform_post[] =
390 new HTMLPurifier_AttrTransform_ImgRequired();
392 //////////////////////////////////////////////////////////////////////
393 // info_attr_transform_* : global attribute transformation that is
394 // unconditionally called. Good for transformations that have complex
395 // start conditions
396 // pre is applied before any validation is done, post is done after
398 $this->info_attr_transform_post[] = new HTMLPurifier_AttrTransform_Lang();
402 function setAttrForTableElements($attr, $def) {
403 $this->info['col']->attr[$attr] =
404 $this->info['colgroup']->attr[$attr] =
405 $this->info['tbody']->attr[$attr] =
406 $this->info['td']->attr[$attr] =
407 $this->info['tfoot']->attr[$attr] =
408 $this->info['th']->attr[$attr] =
409 $this->info['thead']->attr[$attr] =
410 $this->info['tr']->attr[$attr] = $def;
416 * Structure that stores an element definition.
418 class HTMLPurifier_ElementDef
422 * Associative array of attribute name to HTMLPurifier_AttrDef
423 * @public
425 var $attr = array();
428 * List of tag's HTMLPurifier_AttrTransform to be done before validation
429 * @public
431 var $attr_transform_pre = array();
434 * List of tag's HTMLPurifier_AttrTransform to be done after validation
435 * @public
437 var $attr_transform_post = array();
440 * Lookup table of tags that close this tag.
441 * @public
443 var $auto_close = array();
446 * HTMLPurifier_ChildDef of this tag.
447 * @public
449 var $child;
452 * Type of the tag: inline or block or unknown?
453 * @public
455 var $type = 'unknown';
458 * Lookup table of tags excluded from all descendants of this tag.
459 * @public
461 var $excludes = array();