Mute STRICT errors from CSSTidy and don't run PEARSax3 on PHP 5.3.
[htmlpurifier.git] / tests / HTMLPurifier / LexerTest.php
blob1e5c04096b9750fc67cef4e291886ef8c20ae8ef
1 <?php
3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
6 protected $_has_pear = false;
/**
 * Prepares the test case and pulls in the optional lexer implementations
 * that the global test configuration has switched on.
 */
public function __construct()
{
    parent::__construct();

    // PEARSax3 is not maintained and throws loads of DEPRECATED errors in
    // PHP 5.3, so it is only loaded on interpreters older than that.
    $use_pear = $GLOBALS['HTMLPurifierTest']['PEAR']
        && version_compare(PHP_VERSION, '5.3', '<');
    if ($use_pear) {
        require_once 'HTMLPurifier/Lexer/PEARSax3.php';
        $this->_has_pear = true;
    }

    $use_ph5p = $GLOBALS['HTMLPurifierTest']['PH5P'];
    if ($use_ph5p) {
        require_once 'HTMLPurifier/Lexer/PH5P.php';
    }
}
22 // HTMLPurifier_Lexer::create() --------------------------------------------
/**
 * With Core.MaintainLineNumbers enabled, the factory must hand back DirectLex.
 */
public function test_create()
{
    $config = $this->config;
    $config->set('Core.MaintainLineNumbers', true);

    $created = HTMLPurifier_Lexer::create($config);

    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * Core.LexerImpl may be given a lexer instance; the factory result must
 * then be a DirectLex object.
 */
public function test_create_objectLexerImpl()
{
    $implementation = new HTMLPurifier_Lexer_DirectLex();
    $this->config->set('Core.LexerImpl', $implementation);

    $created = HTMLPurifier_Lexer::create($this->config);

    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * An unrecognized Core.LexerImpl name must raise HTMLPurifier_Exception.
 */
public function test_create_unknownLexer()
{
    $this->config->set('Core.LexerImpl', 'AsdfAsdf');

    $anticipated = new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf');
    $this->expectException($anticipated);

    HTMLPurifier_Lexer::create($this->config);
}
/**
 * Requesting DOMLex together with Core.MaintainLineNumbers must raise
 * HTMLPurifier_Exception.
 */
public function test_create_incompatibleLexer()
{
    $config = $this->config;
    $config->set('Core.LexerImpl', 'DOMLex');
    $config->set('Core.MaintainLineNumbers', true);

    $anticipated = new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
    $this->expectException($anticipated);

    HTMLPurifier_Lexer::create($config);
}
49 // HTMLPurifier_Lexer->parseData() -----------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::parseData() turns $input into $expect.
 *
 * @param string      $input  Raw text handed to parseData().
 * @param string|bool $expect Expected result; true means "same as $input".
 */
public function assertParseData($input, $expect = true)
{
    $wanted = ($expect === true) ? $input : $expect;

    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->parseData($input);

    $this->assertIdentical($wanted, $actual);
}
/**
 * Plain text is expected back unchanged.
 */
public function test_parseData_plainText()
{
    $input = 'asdf';
    $this->assertParseData($input);
}
/**
 * The &amp; entity is expected to become a literal ampersand.
 */
public function test_parseData_ampersandEntity()
{
    $input = '&amp;';
    $expected = '&';
    $this->assertParseData($input, $expected);
}
/**
 * The &quot; entity is expected to become a double quote.
 */
public function test_parseData_quotEntity()
{
    $input = '&quot;';
    $expected = '"';
    $this->assertParseData($input, $expected);
}
/**
 * The zero-padded numeric entity &#039; is expected to become an apostrophe.
 */
public function test_parseData_aposNumericEntity()
{
    $input = '&#039;';
    $expected = "'";
    $this->assertParseData($input, $expected);
}
/**
 * The compact numeric entity &#39; is expected to become an apostrophe.
 */
public function test_parseData_aposCompactNumericEntity()
{
    $input = '&#39;';
    $expected = "'";
    $this->assertParseData($input, $expected);
}
/**
 * Three back-to-back &amp; entities are expected to become '&&&'.
 */
public function test_parseData_adjacentAmpersandEntities()
{
    $input = '&amp;&amp;&amp;';
    $expected = '&&&';
    $this->assertParseData($input, $expected);
}
/**
 * An entity followed by a bare trailing ampersand is expected to give '&&'.
 */
public function test_parseData_trailingUnescapedAmpersand()
{
    $input = '&amp;&';
    $expected = '&&';
    $this->assertParseData($input, $expected);
}
/**
 * A bare ampersand inside running text is expected to be left as-is.
 */
public function test_parseData_internalUnescapedAmpersand()
{
    $input = 'Procter & Gamble';
    $this->assertParseData($input);
}
/**
 * The hexadecimal entity '&#x2D;' is expected to come back unchanged.
 */
public function test_parseData_improperEntityFaultToleranceTest()
{
    $input = '&#x2D;';
    $this->assertParseData($input);
}
93 // HTMLPurifier_Lexer->extractBody() ---------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() reduces $text to $extract.
 *
 * @param string      $text    Markup handed to extractBody().
 * @param string|bool $extract Expected result; true means "same as $text".
 */
public function assertExtractBody($text, $extract = true)
{
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->extractBody($text);

    $wanted = ($extract === true) ? $text : $extract;

    $this->assertIdentical($wanted, $actual);
}
/**
 * Markup without body tags is expected back unchanged.
 */
public function test_extractBody_noBodyTags()
{
    $markup = '<b>Bold</b>';
    $this->assertExtractBody($markup);
}
/**
 * Lowercase html/body wrappers are expected to be stripped away.
 */
public function test_extractBody_lowercaseBodyTags()
{
    $markup = '<html><body><b>Bold</b></body></html>';
    $expected = '<b>Bold</b>';
    $this->assertExtractBody($markup, $expected);
}
/**
 * Uppercase HTML/BODY wrappers are expected to be stripped away as well.
 */
public function test_extractBody_uppercaseBodyTags()
{
    $markup = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $expected = '<B>Bold</B>';
    $this->assertExtractBody($markup, $expected);
}
// Feeds assertExtractBody() a complete XHTML document (XML prolog, DOCTYPE,
// <head>, and a <body> wrapping a form) as the first argument; the second
// argument is the form markup expected back from the extraction.
// NOTE(review): both arguments are multi-line string literals, so the exact
// whitespace on each line is part of the expectation; do not re-indent.
114 function test_extractBody_realisticUseCase() {
115 $this->assertExtractBody(
116 '<?xml version="1.0"
117 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
118 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
119 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
120 <head>
121 <title>xyz</title>
122 </head>
123 <body>
124 <form method="post" action="whatever1">
125 <div>
126 <input type="text" name="username" />
127 <input type="text" name="password" />
128 <input type="submit" />
129 </div>
130 </form>
131 </body>
132 </html>',
134 <form method="post" action="whatever1">
135 <div>
136 <input type="text" name="username" />
137 <input type="text" name="password" />
138 <input type="submit" />
139 </div>
140 </form>
/**
 * A body tag carrying attributes is still expected to be stripped.
 */
public function test_extractBody_bodyWithAttributes()
{
    $markup = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $expected = '<b>Bold</b>';
    $this->assertExtractBody($markup, $expected);
}
/**
 * A body tag that is never closed is not accepted: input comes back as-is.
 */
public function test_extractBody_preserveUnclosedBody()
{
    $markup = '<body>asdf';
    $this->assertExtractBody($markup);
}
/**
 * With two closing body tags, extraction is expected to run to the last one.
 */
public function test_extractBody_useLastBody()
{
    $markup = '<body>foo</body>bar</body>';
    $expected = 'foo</body>bar';
    $this->assertExtractBody($markup, $expected);
}
156 // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
/**
 * Tokenizes $input with every available lexer and compares each result
 * against the expected token list.
 *
 * @param string $input      HTML to tokenize.
 * @param array  $expect     Token list expected from lexers without an override.
 * @param array  $alt_expect Per-lexer overrides keyed by lexer name; a value
 *                           of false skips the assertion for that lexer.
 */
public function assertTokenization($input, $expect, $alt_expect = array())
{
    $lexers = array('DirectLex' => new HTMLPurifier_Lexer_DirectLex());
    if ($this->_has_pear) {
        $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
    }
    // NOTE(review): PH5P is instantiated whenever DOMDocument exists, not
    // based on the PH5P flag read in the constructor -- presumably the
    // class is autoloadable; confirm.
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
    }

    foreach ($lexers as $name => $lexer) {
        // Tokenize before checking for a skip, as the lexer is always run.
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);

        $overridden = isset($alt_expect[$name]);
        if ($overridden && $alt_expect[$name] === false) {
            continue;
        }
        $t_expect = $overridden ? $alt_expect[$name] : $expect;

        $this->assertIdentical($result, $t_expect, "$name: %s");

        // Dump the tokens for easier debugging on a (loose) mismatch.
        if ($t_expect != $result) {
            printTokens($result);
        }
    }
}
/**
 * An empty string is expected to produce no tokens.
 */
public function test_tokenizeHTML_emptyInput()
{
    $input = '';
    $expected = array();
    $this->assertTokenization($input, $expected);
}
/**
 * Plain text is expected to produce a single text token.
 */
public function test_tokenizeHTML_plainText()
{
    $input = 'This is regular text.';
    $expected = array(
        new HTMLPurifier_Token_Text('This is regular text.')
    );
    $this->assertTokenization($input, $expected);
}
/**
 * Text interleaved with a <b> element is expected to yield text, start,
 * text, end, text.
 */
public function test_tokenizeHTML_textAndTags()
{
    $input = 'This is <b>bold</b> text';
    $expected = array(
        new HTMLPurifier_Token_Text('This is '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('bold'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_Text(' text'),
    );
    $this->assertTokenization($input, $expected);
}
/**
 * Tokenizes markup whose opening tag is uppercase and closing tag lowercase.
 */
public function test_tokenizeHTML_normalizeCase()
{
    $input = '<DIV>Totally rad dude. <b>asdf</b></div>';
    $expected = array(
        new HTMLPurifier_Token_Start('DIV', array()),
        new HTMLPurifier_Token_Text('Totally rad dude. '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('asdf'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('div'),
    );
    $this->assertTokenization($input, $expected);
}
/**
 * Badly nested, unknown tags: the default expectation mirrors the input tag
 * by tag, while DOMLex and PH5P share a repaired token stream.
 */
public function test_tokenizeHTML_notWellFormed()
{
    $input = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';

    $expected = array(
        new HTMLPurifier_Token_Start('asdf'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_Start('d'),
        new HTMLPurifier_Token_End('d'),
        new HTMLPurifier_Token_Start('poOloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Start('ds'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_End('ASDF'),
    );

    $alt = array(
        new HTMLPurifier_Token_Empty('asdf'),
        new HTMLPurifier_Token_Empty('d'),
        new HTMLPurifier_Token_Start('pooloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Empty('ds'),
        new HTMLPurifier_Token_End('poolasdf'),
        new HTMLPurifier_Token_End('pooloka'),
    );
    $alternatives = array(
        'DOMLex' => $alt,
        'PH5P' => $alt,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * A tab and a newline between attributes are expected to act as ordinary
 * attribute separators.
 */
public function test_tokenizeHTML_whitespaceInTag()
{
    $input = "<a\thref=\"foobar.php\"\ntitle=\"foo!\">Link to <b id=\"asdf\">foobar</b></a>";
    $expected = array(
        new HTMLPurifier_Token_Start('a', array('href' => 'foobar.php', 'title' => 'foo!')),
        new HTMLPurifier_Token_Text('Link to '),
        new HTMLPurifier_Token_Start('b', array('id' => 'asdf')),
        new HTMLPurifier_Token_Text('foobar'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($input, $expected);
}
/**
 * A self-closing <br /> is expected to produce one empty token.
 */
public function test_tokenizeHTML_emptyTag()
{
    $input = '<br />';
    $expected = array(new HTMLPurifier_Token_Empty('br'));
    $this->assertTokenization($input, $expected);
}
/**
 * An HTML comment is expected to produce one comment token with its
 * inner text, surrounding spaces included.
 */
public function test_tokenizeHTML_comment()
{
    $input = '<!-- Comment -->';
    $expected = array(new HTMLPurifier_Token_Comment(' Comment '));
    $this->assertTokenization($input, $expected);
}
/**
 * A comment closed with three dashes is expected to keep the extra dash
 * as part of the comment text.
 */
public function test_tokenizeHTML_malformedComment()
{
    $input = '<!-- not so well formed --->';
    $expected = array(new HTMLPurifier_Token_Comment(' not so well formed -'));
    $this->assertTokenization($input, $expected);
}
/**
 * A tag that is never closed with '>' defaults to literal text; several
 * lexers are given their own expectations.
 */
public function test_tokenizeHTML_unterminatedTag()
{
    $input = '<a href=""';
    $expected = array(new HTMLPurifier_Token_Text('<a href=""'));

    // I like our behavior better, but it's non-standard
    $alternatives = array(
        'DOMLex' => array(new HTMLPurifier_Token_Empty('a', array('href' => ''))),
        'PEARSax3' => array(new HTMLPurifier_Token_Start('a', array('href' => ''))),
        'PH5P' => false, // total barfing, grabs scaffolding too
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * &lt; and &gt; are expected to decode into one text token; PEARSax3 and
 * PH5P are expected to split the text at each entity.
 */
public function test_tokenizeHTML_specialEntities()
{
    $input = '&lt;b&gt;';
    $expected = array(
        new HTMLPurifier_Token_Text('<b>')
    );

    // some parsers will separate entities out
    $split = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
    );
    $alternatives = array(
        'PEARSax3' => $split,
        'PH5P' => $split,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * A stray quote where an attribute name belongs; PH5P is skipped here and
 * covered by its own dedicated test.
 */
public function test_tokenizeHTML_earlyQuote()
{
    $input = '<a "=>';
    $expected = array(new HTMLPurifier_Token_Empty('a'));

    // we barf on this input
    $tokens = array(
        new HTMLPurifier_Token_Start('a', array('"' => ''))
    );
    $alternatives = array(
        'DirectLex' => $tokens,
        'PEARSax3' => $tokens,
        'PH5P' => false, // behavior varies; handle this personally
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * PH5P-specific variant of the early-quote test: the expected token class
 * depends on whether the context reports a 'PH5PError'.
 */
public function test_tokenizeHTML_earlyQuote_PH5P()
{
    if (!class_exists('DOMDocument')) {
        return;
    }

    $lexer = new HTMLPurifier_Lexer_PH5P();
    $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);

    $expected_token = $this->context->get('PH5PError', true)
        ? new HTMLPurifier_Token_Start('a', array('"' => ''))
        : new HTMLPurifier_Token_Empty('a', array('"' => ''));

    $this->assertIdentical(array($expected_token), $result);
}
/**
 * A bare double quote is expected to survive as a text token.
 */
public function test_tokenizeHTML_unescapedQuote()
{
    $input = '"';
    $expected = array(new HTMLPurifier_Token_Text('"'));
    $this->assertTokenization($input, $expected);
}
/**
 * &quot; is expected to decode into a double-quote text token.
 */
public function test_tokenizeHTML_escapedQuote()
{
    $input = '&quot;';
    $expected = array(new HTMLPurifier_Token_Text('"'));
    $alternatives = array(
        'PEARSax3' => false, // PEAR barfs on this
    );
    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * CDATA contents are expected to come through verbatim as one text token;
 * PEARSax3 and PH5P are expected to fragment them.
 */
public function test_tokenizeHTML_cdata()
{
    $input = '<![CDATA[You <b>can&#39;t</b> get me!]]>';
    $expected = array(new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!'));

    // PEAR splits up all of the CDATA
    $split = array(
        new HTMLPurifier_Token_Text('You '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('can'),
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('#39;t'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text(' get me!'),
    );
    $alternatives = array(
        'PEARSax3' => $split,
        'PH5P' => $split,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * &theta; is expected to decode to the two-byte sequence \xCE\xB8.
 */
public function test_tokenizeHTML_characterEntity()
{
    $input = '&theta;';
    $expected = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
    $this->assertTokenization($input, $expected);
}
/**
 * An entity inside CDATA is expected to stay undecoded; PEARSax3 and PH5P
 * are expected to split it at the ampersand.
 */
public function test_tokenizeHTML_characterEntityInCDATA()
{
    $input = '<![CDATA[&rarr;]]>';
    $expected = array(new HTMLPurifier_Token_Text("&rarr;"));

    $split = array(
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('rarr;'),
    );
    $alternatives = array(
        'PEARSax3' => $split,
        'PH5P' => $split,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * &amp; inside an attribute value is expected to decode to a bare ampersand.
 */
public function test_tokenizeHTML_entityInAttribute()
{
    $input = '<a href="index.php?title=foo&amp;id=bar">Link</a>';
    $expected = array(
        new HTMLPurifier_Token_Start('a', array('href' => 'index.php?title=foo&id=bar')),
        new HTMLPurifier_Token_Text('Link'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($input, $expected);
}
/**
 * Raw bytes \xCE\xB8 are expected to pass through untouched.
 */
public function test_tokenizeHTML_preserveUTF8()
{
    $input = "\xCE\xB8";
    $expected = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
    $this->assertTokenization($input, $expected);
}
/**
 * &lt; inside an attribute value is expected to decode to '<'.
 */
public function test_tokenizeHTML_specialEntityInAttribute()
{
    $input = '<br test="x &lt; 6" />';
    $expected = array(
        new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6'))
    );
    $this->assertTokenization($input, $expected);
}
/**
 * '<3' and '>.>' inside text are expected to stay text rather than being
 * treated as tags.
 */
public function test_tokenizeHTML_emoticonProtection()
{
    $input = '<b>Whoa! <3 That\'s not good >.></b>';

    $expected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b')
    );

    // text is absorbed together
    $dom_tokens = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );
    // interesting grouping
    $ph5p_tokens = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );
    $alternatives = array(
        'DOMLex' => $dom_tokens,
        'PEARSax3' => false, // totally mangled
        'PH5P' => $ph5p_tokens,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * Angle brackets inside a comment are expected not to end it early.
 */
public function test_tokenizeHTML_commentWithFunkyChars()
{
    $input = '<!-- This >< comment --><br />';
    $expected = array(
        new HTMLPurifier_Token_Comment(' This >< comment '),
        new HTMLPurifier_Token_Empty('br'),
    );
    $alternatives = array(
        'PEARSax3' => false,
    );
    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * An unclosed comment is expected to become a comment token; only
 * DirectLex is actually asserted, the other lexers are skipped.
 */
public function test_tokenizeHTML_unterminatedComment()
{
    $input = '<!-- This >< comment';
    $expected = array(new HTMLPurifier_Token_Comment(' This >< comment'));
    $alternatives = array(
        'DOMLex' => false,
        'PEARSax3' => false,
        'PH5P' => false,
    );
    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * With HTML.Trusted enabled, markup-like text inside <script> is expected
 * to stay a single text token.
 */
public function test_tokenizeHTML_scriptCDATAContents()
{
    $this->config->set('HTML.Trusted', true);

    $input = 'Foo: <script>alert("<foo>");</script>';
    $expected = array(
        new HTMLPurifier_Token_Text('Foo: '),
        new HTMLPurifier_Token_Start('script'),
        new HTMLPurifier_Token_Text('alert("<foo>");'),
        new HTMLPurifier_Token_End('script'),
    );
    $alternatives = array(
        'PEARSax3' => false,
        // PH5P, for some reason, bubbles the script to <head>
        'PH5P' => false,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * Entities and bare specials inside a comment are expected to be kept
 * verbatim in the comment token.
 */
public function test_tokenizeHTML_entitiesInComment()
{
    $input = '<!-- This comment < &lt; & -->';
    $expected = array(new HTMLPurifier_Token_Comment(' This comment < &lt; & '));
    $alternatives = array(
        'PEARSax3' => false
    );
    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * Angle brackets inside a quoted attribute value; DirectLex is expected to
 * cut the tag at the first '>' instead.
 */
public function test_tokenizeHTML_attributeWithSpecialCharacters()
{
    $input = '<a href="><>">';
    $expected = array(
        new HTMLPurifier_Token_Empty('a', array('href' => '><>'))
    );

    $direct_tokens = array(
        new HTMLPurifier_Token_Start('a', array('href' => '')),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('">'),
    );
    $alternatives = array(
        'DirectLex' => $direct_tokens,
        'PEARSax3' => false,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * Slashes inside an attribute value are expected not to interfere with
 * recognizing the self-closing tag.
 */
public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
{
    $input = '<param name="src" value="http://example.com/video.wmv" />';
    $attributes = array('name' => 'src', 'value' => 'http://example.com/video.wmv');
    $expected = array(new HTMLPurifier_Token_Empty('param', $attributes));
    $this->assertTokenization($input, $expected);
}
/**
 * A <style> element whose contents are wrapped in an HTML comment: the
 * default expectation is a text token, DirectLex is expected to emit a
 * comment token, and PH5P is skipped.
 */
public function test_tokenizeHTML_style()
{
    // DirectLex defers to RemoveForeignElements for textification
    $comment_form = array(
        new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
        new HTMLPurifier_Token_Comment("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );
    $extra = array(
        // PH5P doesn't seem to like style tags
        'PH5P' => false,
        'DirectLex' => $comment_form,
    );

    // LIBXML_VERSION is missing in early versions of PHP prior to 1.30 of
    // php-src/ext/libxml/libxml.c (version-wise, this translates to 5.0.x).
    // In such cases, punt the test entirely.
    if (!defined('LIBXML_VERSION')) {
        return;
    }
    // libxml's behavior is wrong prior to this version, so make
    // appropriate accommodations
    if (LIBXML_VERSION < 20628) {
        $extra['DOMLex'] = $extra['DirectLex'];
    }

    // The literal spans three lines; continuation lines start at column 0
    // because their whitespace is part of the input.
    $input = '<style type="text/css"><!--
div {}
--></style>';
    $expected = array(
        new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
        new HTMLPurifier_Token_Text("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );

    $this->assertTokenization($input, $expected, $extra);
}
/**
 * A tag name containing '@' followed by a surplus '>'.
 */
public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
{
    // Technically this is invalid, but it won't be a problem with invalid
    // element removal; also, this mimics Mozilla's parsing of the tag.
    $alt_expect = array(
        new HTMLPurifier_Token_Start('a@'),
        new HTMLPurifier_Token_Text('>'),
    );

    $input = '<a@>>';
    $expected = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_End('a'),
    );
    $alternatives = array(
        'DirectLex' => $alt_expect,
        'PEARSax3' => $alt_expect,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * A '<3' emoticon sandwiched between two <br /> tags.
 */
public function test_tokenizeHTML_emoticonHeart()
{
    $input = '<br /><3<br />';

    $expected = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3'),
        new HTMLPurifier_Token_Empty('br'),
    );

    $dom_tokens = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<3'),
        new HTMLPurifier_Token_Empty('br'),
    );
    // bah too lazy to fix this
    $pear_tokens = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Empty('3<br'),
    );
    $alternatives = array(
        'DOMLex' => $dom_tokens,
        'PEARSax3' => $pear_tokens,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * Two consecutive '<' characters inside a <b> element.
 */
public function test_tokenizeHTML_emoticonShiftyEyes()
{
    $input = '<b><<</b>';

    $expected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_End('b'),
    );

    $dom_tokens = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<<'),
        new HTMLPurifier_Token_End('b'),
    );
    // also too lazy to fix
    $pear_tokens = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Empty('<<'),
        new HTMLPurifier_Token_Text('b>'),
    );
    $alternatives = array(
        'DOMLex' => $dom_tokens,
        'PEARSax3' => $pear_tokens,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * A lone '<' followed by a space and then a real <b> element.
 */
public function test_tokenizeHTML_eon1996()
{
    $input = '< <b>test</b>';

    $expected = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text(' '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );

    $dom_tokens = array(
        new HTMLPurifier_Token_Text('< '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );
    // totally doing the wrong thing here
    $pear_tokens = array(
        new HTMLPurifier_Token_Text(' '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );
    $alternatives = array(
        'DOMLex' => $dom_tokens,
        'PEARSax3' => $pear_tokens,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * Body tags inside CDATA are expected to be plain text; PH5P and PEARSax3
 * are expected to fragment that text.
 */
public function test_tokenizeHTML_bodyInCDATA()
{
    $alt_tokens = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('body'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('Foo'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/body'),
        new HTMLPurifier_Token_Text('>'),
    );

    $input = '<![CDATA[<body>Foo</body>]]>';
    $expected = array(
        new HTMLPurifier_Token_Text('<body>Foo</body>'),
    );
    $alternatives = array(
        'PH5P' => $alt_tokens,
        'PEARSax3' => $alt_tokens,
    );

    $this->assertTokenization($input, $expected, $alternatives);
}
/**
 * A self-closing <img /> nested in an <a> element is expected to yield
 * start, empty, end.
 */
public function test_tokenizeHTML_()
{
    $input = '<a><img /></a>';
    $expected = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Empty('img'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($input, $expected);
}
706 function test_tokenizeHTML_() {
707 $this->assertTokenization(
709 array(
718 // vim: et sw=4 sts=4