[3.1.0] Fixed fatal error in PH5P lexer with invalid tag names
[htmlpurifier.git] / tests / HTMLPurifier / LexerTest.php
blob6425ca358b64817e414f202a1a457aeef902ff78
1 <?php
3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
6 protected $_has_pear = false;
/**
 * Initializes the harness and pulls in the optional lexer back-ends that
 * the test configuration has switched on.
 */
function HTMLPurifier_LexerTest() {
    parent::HTMLPurifier_Harness();

    // E_STRICT = 2048; the integer literal is kept for PHP4 compatibility.
    // The PEAR lexer class is not strict safe, so it is left out whenever
    // strict error reporting is active (there ought to be a better way).
    $strict_reporting = (error_reporting() & 2048) == 2048;
    if ($GLOBALS['HTMLPurifierTest']['PEAR'] && !$strict_reporting) {
        require_once 'HTMLPurifier/Lexer/PEARSax3.php';
        $this->_has_pear = true;
    }

    if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
        require_once 'HTMLPurifier/Lexer/PH5P.php';
    }
}
24 // HTMLPurifier_Lexer::create() --------------------------------------------
/**
 * With Core.MaintainLineNumbers enabled, the factory is expected to hand
 * back a DirectLex instance.
 */
function test_create() {
    $this->config->set('Core', 'MaintainLineNumbers', true);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
32 // HTMLPurifier_Lexer->parseData() -----------------------------------------
/**
 * Asserts that parseData() turns $input into $expect.
 *
 * @param string      $input  Raw text handed to the lexer.
 * @param string|bool $expect Expected result; true means "same as $input".
 */
function assertParseData($input, $expect = true) {
    $wanted = ($expect === true) ? $input : $expect;
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->parseData($input);
    $this->assertIdentical($wanted, $actual);
}
/**
 * Text without entities is expected back unchanged.
 */
function test_parseData_plainText() {
    $text = 'asdf';
    $this->assertParseData($text);
}
/**
 * The named ampersand entity is expected to decode to a bare ampersand.
 */
function test_parseData_ampersandEntity() {
    $encoded = '&amp;';
    $decoded = '&';
    $this->assertParseData($encoded, $decoded);
}
/**
 * The named quote entity is expected to decode to a double quote.
 */
function test_parseData_quotEntity() {
    $encoded = '&quot;';
    $decoded = '"';
    $this->assertParseData($encoded, $decoded);
}
/**
 * The zero-padded numeric apostrophe entity is expected to decode to a
 * single quote.
 */
function test_parseData_aposNumericEntity() {
    $encoded = '&#039;';
    $decoded = "'";
    $this->assertParseData($encoded, $decoded);
}
/**
 * The unpadded numeric apostrophe entity is expected to decode to a
 * single quote.
 */
function test_parseData_aposCompactNumericEntity() {
    $encoded = '&#39;';
    $decoded = "'";
    $this->assertParseData($encoded, $decoded);
}
/**
 * Back-to-back ampersand entities are each expected to be decoded.
 */
function test_parseData_adjacentAmpersandEntities() {
    $encoded = '&amp;&amp;&amp;';
    $decoded = '&&&';
    $this->assertParseData($encoded, $decoded);
}
/**
 * An entity followed by a raw trailing ampersand is expected to yield two
 * ampersands.
 */
function test_parseData_trailingUnescapedAmpersand() {
    $encoded = '&amp;&';
    $decoded = '&&';
    $this->assertParseData($encoded, $decoded);
}
/**
 * A raw ampersand surrounded by text is expected to pass through
 * unchanged.
 */
function test_parseData_internalUnescapedAmpersand() {
    $text = 'Procter & Gamble';
    $this->assertParseData($text);
}
/**
 * The hexadecimal entity '&#x2D;' is expected to come back from
 * parseData() exactly as it went in.
 */
function test_parseData_improperEntityFaultToleranceTest() {
    $text = '&#x2D;';
    $this->assertParseData($text);
}
76 // HTMLPurifier_Lexer->extractBody() ---------------------------------------
/**
 * Asserts that extractBody() reduces $text to $extract.
 *
 * @param string      $text    Markup handed to the lexer.
 * @param string|bool $extract Expected result; true means "same as $text".
 */
function assertExtractBody($text, $extract = true) {
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->extractBody($text);
    $wanted = ($extract === true) ? $text : $extract;
    $this->assertIdentical($wanted, $actual);
}
/**
 * Markup without body tags is expected back unchanged.
 */
function test_extractBody_noBodyTags() {
    $html = '<b>Bold</b>';
    $this->assertExtractBody($html);
}
/**
 * Lowercase html/body wrappers are expected to be stripped, leaving only
 * the inner markup.
 */
function test_extractBody_lowercaseBodyTags() {
    $document = '<html><body><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * Uppercase HTML/BODY wrappers are expected to be stripped, leaving only
 * the inner markup.
 */
function test_extractBody_uppercaseBodyTags() {
    $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $inner = '<B>Bold</B>';
    $this->assertExtractBody($document, $inner);
}
// Feeds a complete XHTML document (XML prolog, DOCTYPE, head, and a body
// holding a form) to assertExtractBody() and expects only the markup found
// between the body tags to come back.
// NOTE(review): both string fixtures span several lines, so their exact
// whitespace is part of the expected value -- confirm against the original
// file before editing them.
97 function test_extractBody_realisticUseCase() {
98 $this->assertExtractBody(
99 '<?xml version="1.0"
100 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
101 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
102 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
103 <head>
104 <title>xyz</title>
105 </head>
106 <body>
107 <form method="post" action="whatever1">
108 <div>
109 <input type="text" name="username" />
110 <input type="text" name="password" />
111 <input type="submit" />
112 </div>
113 </form>
114 </body>
115 </html>',
117 <form method="post" action="whatever1">
118 <div>
119 <input type="text" name="username" />
120 <input type="text" name="password" />
121 <input type="submit" />
122 </div>
123 </form>
/**
 * A body tag carrying attributes is still expected to be recognized and
 * stripped.
 */
function test_extractBody_bodyWithAttributes() {
    $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * A body tag that is never closed is not accepted as a wrapper, so the
 * input is expected back unchanged.
 */
function test_extractBody_preserveUnclosedBody() {
    $html = '<body>asdf';
    $this->assertExtractBody($html);
}
135 // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
/**
 * Tokenizes $input with every available lexer and compares the results.
 *
 * @param string $input      Markup to tokenize.
 * @param array  $expect     Token list expected from lexers that have no
 *                           entry in $alt_expect.
 * @param array  $alt_expect Per-lexer overrides keyed by lexer name; a
 *                           value of false skips the comparison for that
 *                           lexer (it is still run).
 */
function assertTokenization($input, $expect, $alt_expect = array()) {
    $lexers = array('DirectLex' => new HTMLPurifier_Lexer_DirectLex());
    if ($this->_has_pear) {
        $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
    }
    $dom_available = version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument');
    if ($dom_available) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
    }

    foreach ($lexers as $name => $lexer) {
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);

        $has_override = isset($alt_expect[$name]);
        if ($has_override && $alt_expect[$name] === false) {
            continue;
        }
        $wanted = $has_override ? $alt_expect[$name] : $expect;

        $this->assertIdentical($result, $wanted, "$name: %s");
        // Dump the tokens to help diagnose a mismatch.
        if ($wanted != $result) {
            printTokens($result);
        }
    }
}
/**
 * An empty string is expected to produce no tokens.
 */
function test_tokenizeHTML_emptyInput() {
    $no_tokens = array();
    $this->assertTokenization('', $no_tokens);
}
/**
 * Tag-free text is expected to become a single text token.
 */
function test_tokenizeHTML_plainText() {
    $html = 'This is regular text.';
    $tokens = array(
        new HTMLPurifier_Token_Text('This is regular text.')
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * Text interleaved with one element is expected to split into text,
 * start, text, end, text.
 */
function test_tokenizeHTML_textAndTags() {
    $html = 'This is <b>bold</b> text';
    $tokens = array(
        new HTMLPurifier_Token_Text('This is '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('bold'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_Text(' text'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * Mixed-case tags: the expected tokens are built with the tag names
 * spelled as they appear in the input ('DIV' to open, 'div' to close).
 */
function test_tokenizeHTML_normalizeCase() {
    $html = '<DIV>Totally rad dude. <b>asdf</b></div>';
    $tokens = array(
        new HTMLPurifier_Token_Start('DIV', array()),
        new HTMLPurifier_Token_Text('Totally rad dude. '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('asdf'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('div'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * Badly nested, unknown tags: the default expectation mirrors the input
 * tag-for-tag, while DOMLex and PH5P are expected to return a repaired
 * tree.
 */
function test_tokenizeHTML_notWellFormed() {
    $html = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';

    $literal = array(
        new HTMLPurifier_Token_Start('asdf'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_Start('d'),
        new HTMLPurifier_Token_End('d'),
        new HTMLPurifier_Token_Start('poOloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Start('ds'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_End('ASDF'),
    );

    $alt = array(
        new HTMLPurifier_Token_Empty('asdf'),
        new HTMLPurifier_Token_Empty('d'),
        new HTMLPurifier_Token_Start('pooloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Empty('ds'),
        new HTMLPurifier_Token_End('poolasdf'),
        new HTMLPurifier_Token_End('pooloka'),
    );

    $this->assertTokenization(
        $html,
        $literal,
        array(
            'DOMLex' => $alt,
            'PH5P' => $alt,
        )
    );
}
/**
 * A tab and a newline between attributes are expected to act as ordinary
 * attribute separators.
 */
function test_tokenizeHTML_whitespaceInTag() {
    $html = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
    $tokens = array(
        new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
        new HTMLPurifier_Token_Text('Link to '),
        new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
        new HTMLPurifier_Token_Text('foobar'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * A self-closing tag is expected to become a single empty token.
 */
function test_tokenizeHTML_emptyTag() {
    $html = '<br />';
    $tokens = array( new HTMLPurifier_Token_Empty('br') );
    $this->assertTokenization($html, $tokens);
}
/**
 * A well-formed comment is expected to become one comment token; the
 * PEARSax3 expectation keeps the surrounding dashes in the token data.
 */
function test_tokenizeHTML_comment() {
    $html = '<!-- Comment -->';
    $tokens = array( new HTMLPurifier_Token_Comment(' Comment ') );
    $overrides = array(
        'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
    );
    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * A comment closed with an extra dash is expected to keep that dash in
 * its data; the PEARSax3 expectation keeps all surrounding dashes.
 */
function test_tokenizeHTML_malformedComment() {
    $html = '<!-- not so well formed --->';
    $tokens = array( new HTMLPurifier_Token_Comment(' not so well formed -') );
    $overrides = array(
        'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
    );
    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * A tag missing its closing bracket: the default expectation treats it as
 * literal text, with separate expectations for DOMLex and PEARSax3.
 */
function test_tokenizeHTML_unterminatedTag() {
    $html = '<a href=""';
    $tokens = array( new HTMLPurifier_Token_Text('<a href=""') );
    $overrides = array(
        // I like our behavior better, but it's non-standard
        'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
        'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
        'PH5P' => false, // total barfing, grabs scaffolding too
    );
    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * Escaped angle brackets are expected to decode into text rather than a
 * tag.
 */
function test_tokenizeHTML_specialEntities() {
    $html = '&lt;b&gt;';
    $tokens = array(
        new HTMLPurifier_Token_Text('<b>')
    );

    // some parsers will separate entities out
    $split = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
    );

    $this->assertTokenization(
        $html,
        $tokens,
        array(
            'PEARSax3' => $split,
            'PH5P' => $split,
        )
    );
}
/**
 * A stray quote inside a tag: the default expectation is a bare empty
 * 'a' token, with lexer-specific expectations where the quote ends up as
 * an attribute name.
 */
function test_tokenizeHTML_earlyQuote() {
    $html = '<a "=>';
    $default = array( new HTMLPurifier_Token_Empty('a') );

    // we barf on this input
    $tokens = array(
        new HTMLPurifier_Token_Start('a', array('"' => ''))
    );

    $this->assertTokenization(
        $html,
        $default,
        array(
            'DirectLex' => $tokens,
            'PEARSax3' => $tokens,
            'PH5P' => array(
                new HTMLPurifier_Token_Empty('a', array('"' => ''))
            ),
        )
    );
}
/**
 * A lone double quote is expected to become a text token holding that
 * quote.
 */
function test_tokenizeHTML_unescapedQuote() {
    $html = '"';
    $tokens = array( new HTMLPurifier_Token_Text('"') );
    $this->assertTokenization($html, $tokens);
}
/**
 * The quote entity is expected to decode to a literal double quote; the
 * PEARSax3 comparison is skipped.
 */
function test_tokenizeHTML_escapedQuote() {
    $html = '&quot;';
    $tokens = array( new HTMLPurifier_Token_Text('"') );
    $overrides = array(
        'PEARSax3' => false, // PEAR barfs on this
    );
    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * CDATA contents are expected to come through verbatim as text; markup
 * and entities inside are not interpreted.
 */
function test_tokenizeHTML_cdata() {
    $html = '<![CDATA[You <b>can&#39;t</b> get me!]]>';
    $tokens = array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') );

    // PEAR splits up all of the CDATA
    $split = array(
        new HTMLPurifier_Token_Text('You '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('can'),
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('#39;t'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text(' get me!'),
    );

    $this->assertTokenization(
        $html,
        $tokens,
        array(
            'PEARSax3' => $split,
            'PH5P' => $split,
        )
    );
}
/**
 * The named theta entity is expected to decode to the two bytes
 * 0xCE 0xB8.
 */
function test_tokenizeHTML_characterEntity() {
    $html = '&theta;';
    $tokens = array( new HTMLPurifier_Token_Text("\xCE\xB8") );
    $this->assertTokenization($html, $tokens);
}
/**
 * An entity inside CDATA is expected to stay undecoded; PEARSax3 and PH5P
 * are expected to split it at the ampersand.
 */
function test_tokenizeHTML_characterEntityInCDATA() {
    $html = '<![CDATA[&rarr;]]>';
    $tokens = array( new HTMLPurifier_Token_Text("&rarr;") );

    $split = array(
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('rarr;'),
    );

    $this->assertTokenization(
        $html,
        $tokens,
        array(
            'PEARSax3' => $split,
            'PH5P' => $split,
        )
    );
}
/**
 * An ampersand entity inside an attribute value is expected to be
 * decoded.
 */
function test_tokenizeHTML_entityInAttribute() {
    $html = '<a href="index.php?title=foo&amp;id=bar">Link</a>';
    $tokens = array(
        new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
        new HTMLPurifier_Token_Text('Link'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * The raw byte sequence 0xCE 0xB8 is expected to pass through untouched.
 */
function test_tokenizeHTML_preserveUTF8() {
    $bytes = "\xCE\xB8";
    $tokens = array( new HTMLPurifier_Token_Text("\xCE\xB8") );
    $this->assertTokenization($bytes, $tokens);
}
/**
 * An escaped less-than sign inside an attribute value is expected to be
 * decoded.
 */
function test_tokenizeHTML_specialEntityInAttribute() {
    $html = '<br test="x &lt; 6" />';
    $tokens = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
    $this->assertTokenization($html, $tokens);
}
/**
 * With Core.AggressivelyFixLt enabled, stray angle brackets used as
 * emoticons are expected to be kept as text instead of opening tags.
 */
function test_tokenizeHTML_emoticonProtection() {
    $this->config->set('Core', 'AggressivelyFixLt', true);

    $html = '<b>Whoa! <3 That\'s not good >.></b>';
    $tokens = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<3 That\'s not good >'),
        new HTMLPurifier_Token_Text('.>'),
        new HTMLPurifier_Token_End('b')
    );

    $overrides = array(
        // text is absorbed together
        'DOMLex' => array(
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
            new HTMLPurifier_Token_End('b'),
        ),
        'PEARSax3' => false, // totally mangled
        'PH5P' => array( // interesting grouping
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('Whoa! '),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
            new HTMLPurifier_Token_End('b'),
        ),
    );

    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * Angle brackets inside a comment are not expected to end it early; the
 * PEARSax3 comparison is skipped.
 */
function test_tokenizeHTML_commentWithFunkyChars() {
    $html = '<!-- This >< comment --><br />';
    $tokens = array(
        new HTMLPurifier_Token_Comment(' This >< comment '),
        new HTMLPurifier_Token_Empty('br'),
    );
    $overrides = array(
        'PEARSax3' => false,
    );
    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * A comment that is never closed is expected to run to the end of the
 * input; only the lexers without an override are compared.
 */
function test_tokenizeHTML_unterminatedComment() {
    $html = '<!-- This >< comment';
    $tokens = array( new HTMLPurifier_Token_Comment(' This >< comment') );
    $overrides = array(
        'DOMLex' => false,
        'PEARSax3' => false,
        'PH5P' => false,
    );
    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * With HTML.Trusted enabled, the contents of a script element are
 * expected to be kept as one text token even though they contain
 * something that looks like a tag.
 */
function test_tokenizeHTML_scriptCDATAContents() {
    $this->config->set('HTML', 'Trusted', true);

    $html = 'Foo: <script>alert("<foo>");</script>';
    $tokens = array(
        new HTMLPurifier_Token_Text('Foo: '),
        new HTMLPurifier_Token_Start('script'),
        new HTMLPurifier_Token_Text('alert("<foo>");'),
        new HTMLPurifier_Token_End('script'),
    );
    $overrides = array(
        'PEARSax3' => false,
        // PH5P, for some reason, bubbles the script to <head>
        'PH5P' => false,
    );

    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * With Core.AggressivelyFixLt enabled, entities and stray characters
 * inside a comment are expected to stay exactly as written.
 */
function test_tokenizeHTML_entitiesInComment() {
    $this->config->set('Core', 'AggressivelyFixLt', true);

    $html = '<!-- This comment < &lt; & -->';
    $tokens = array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') );
    $overrides = array(
        'PEARSax3' => false
    );

    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * Angle brackets inside a quoted attribute value: the default expectation
 * keeps them in the value, while the DirectLex expectation ends the tag
 * at the first closing bracket.
 */
function test_tokenizeHTML_attributeWithSpecialCharacters() {
    $html = '<a href="><>">';
    $tokens = array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) );
    $overrides = array(
        'DirectLex' => array(
            new HTMLPurifier_Token_Start('a', array('href' => '')),
            new HTMLPurifier_Token_Text('<">'),
        ),
        'PEARSax3' => false,
    );
    $this->assertTokenization($html, $tokens, $overrides);
}
/**
 * Slashes inside an attribute value are not expected to interfere with
 * recognizing the self-closing tag.
 */
function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
    $html = '<param name="src" value="http://example.com/video.wmv" />';
    $attributes = array('name' => 'src', 'value' => 'http://example.com/video.wmv');
    $tokens = array( new HTMLPurifier_Token_Empty('param', $attributes) );
    $this->assertTokenization($html, $tokens);
}
/**
 * A style element whose rules are wrapped in an HTML comment: the default
 * expectation is a text token for the rules, while DirectLex (and DOMLex
 * on old libxml builds) is expected to report a comment token instead.
 */
function test_tokenizeHTML_style() {
    // DirectLex defers to RemoveForeignElements for textification
    $as_comment = array(
        new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
        new HTMLPurifier_Token_Comment("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );

    $extra = array(
        // PH5P doesn't seem to like style tags
        'PH5P' => false,
        'DirectLex' => $as_comment,
    );

    // libxml's behavior is wrong prior to 2.6.28, so make appropriate
    // accommodations. LIBXML_VERSION is missing in early versions of PHP
    // (prior to 1.30 of php-src/ext/libxml/libxml.c, i.e. 5.0.x); in that
    // case an old libxml is assumed, which might not be true but is very
    // likely.
    $old_libxml = !defined('LIBXML_VERSION') || LIBXML_VERSION < 20628;
    if ($old_libxml) {
        $extra['DOMLex'] = $extra['DirectLex'];
    }

    $html = '<style type="text/css"><!--
div {}
--></style>';
    $tokens = array(
        new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
        new HTMLPurifier_Token_Text("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );

    $this->assertTokenization($html, $tokens, $extra);
}
// Tokenizes '<a@>>', a tag name followed by an at sign and an extra
// closing bracket. The default expectation is an 'a' element wrapping a
// '>' text token; the DirectLex expectation keeps 'a@' as the tag name.
// NOTE(review): the method name ends in a bare underscore and the same name
// is declared again immediately after this method -- two methods with one
// name cannot coexist in a PHP class, so confirm against the original file
// whether one of them is commented out or meant to carry a distinct name.
546 function test_tokenizeHTML_() {
547 $this->assertTokenization(
548 '<a@>>',
549 array(
550 new HTMLPurifier_Token_Start('a'),
551 new HTMLPurifier_Token_Text('>'),
552 new HTMLPurifier_Token_End('a'),
554 array(
555 'DirectLex' => array(
556 // Technically this is invalid, but it won't be a
557 // problem with invalid element removal; also, this
558 // mimics Mozilla's parsing of the tag.
559 new HTMLPurifier_Token_Start('a@'),
560 new HTMLPurifier_Token_Text('>'),
568 function test_tokenizeHTML_() {
569 $this->assertTokenization(
571 array(