3 class HTMLPurifier_LexerTest
extends HTMLPurifier_Harness
6 protected $_has_pear = false;
/**
 * Loads the optional lexer implementations enabled for this test run.
 *
 * Reads the PEAR and PH5P switches from $GLOBALS['HTMLPurifierTest'] and
 * requires the matching lexer files. $_has_pear is set to true only when
 * the PEARSax3 lexer file has been required.
 */
public function __construct() {
    // NOTE(review): no parent constructor call is visible here -- confirm
    // whether HTMLPurifier_Harness expects one.

    // E_STRICT = 2048, int used for PHP4 compat: this check disables
    // PEAR if PHP 5 strict mode is on, since the class is not strict safe
    if (
        $GLOBALS['HTMLPurifierTest']['PEAR'] &&
        ((error_reporting() & 2048) != 2048) // ought to be a better way
    ) {
        require_once 'HTMLPurifier/Lexer/PEARSax3.php';
        $this->_has_pear = true;
    }
    if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
        require_once 'HTMLPurifier/Lexer/PH5P.php';
    }
}
24 // HTMLPurifier_Lexer::create() --------------------------------------------
/**
 * With Core.MaintainLineNumbers switched on, HTMLPurifier_Lexer::create()
 * is expected to return a DirectLex instance.
 */
public function test_create() {
    $config = $this->config;
    $config->set('Core.MaintainLineNumbers', true);
    $created = HTMLPurifier_Lexer::create($config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * Core.LexerImpl may be given a lexer object instead of a name; create()
 * is expected to return a DirectLex instance in that case.
 */
public function test_create_objectLexerImpl() {
    $implementation = new HTMLPurifier_Lexer_DirectLex();
    $this->config->set('Core.LexerImpl', $implementation);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * An unrecognized Core.LexerImpl name must make create() throw an
 * HTMLPurifier_Exception carrying the message below.
 */
public function test_create_unknownLexer() {
    $this->config->set('Core.LexerImpl', 'AsdfAsdf');
    $expected = new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf');
    // The expectation has to be registered before the throwing call.
    $this->expectException($expected);
    HTMLPurifier_Lexer::create($this->config);
}
/**
 * Requesting DOMLex together with Core.MaintainLineNumbers must make
 * create() throw an HTMLPurifier_Exception carrying the message below.
 */
public function test_create_incompatibleLexer() {
    $config = $this->config;
    $config->set('Core.LexerImpl', 'DOMLex');
    $config->set('Core.MaintainLineNumbers', true);
    $expected = new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
    // The expectation has to be registered before the throwing call.
    $this->expectException($expected);
    HTMLPurifier_Lexer::create($config);
}
51 // HTMLPurifier_Lexer->parseData() -----------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::parseData() maps $input to $expect.
 *
 * @param string      $input  text handed to parseData()
 * @param string|bool $expect expected result; the default of true means
 *                            the input is expected back unchanged
 */
public function assertParseData($input, $expect = true) {
    $expected = ($expect === true) ? $input : $expect;
    $lexer = new HTMLPurifier_Lexer();
    $this->assertIdentical($expected, $lexer->parseData($input));
}
/**
 * Text without any ampersand is expected back unchanged.
 */
public function test_parseData_plainText() {
    $plain = 'asdf';
    $this->assertParseData($plain);
}
/**
 * The named ampersand entity must be decoded to a bare '&'.
 */
public function test_parseData_ampersandEntity() {
    // The input has to be the escaped form: feeding a literal '&' and
    // expecting '&' back would not exercise entity decoding at all.
    $this->assertParseData('&amp;', '&');
}
/**
 * The named quote entity must be decoded to a double quote.
 */
public function test_parseData_quotEntity() {
    // Escaped input is required; a literal '"' would make the test vacuous.
    $this->assertParseData('&quot;', '"');
}
/**
 * The zero-padded numeric apostrophe entity must be decoded to "'".
 */
public function test_parseData_aposNumericEntity() {
    // A raw apostrophe inside the single-quoted literal is a parse error,
    // so the input is spelled as the numeric reference the test is about.
    $this->assertParseData('&#039;', "'");
}
/**
 * The compact (no leading zero) numeric apostrophe entity must be decoded
 * to "'".
 */
public function test_parseData_aposCompactNumericEntity() {
    // A raw apostrophe inside the single-quoted literal is a parse error,
    // so the input is spelled as the numeric reference the test is about.
    $this->assertParseData('&#39;', "'");
}
/**
 * Three ampersand entities in a row must each be decoded.
 */
public function test_parseData_adjacentAmpersandEntities() {
    // Escaped input is required; literal '&&&' would make the test vacuous.
    $this->assertParseData('&amp;&amp;&amp;', '&&&');
}
/**
 * An ampersand entity followed by a bare trailing '&' must yield '&&'.
 */
public function test_parseData_trailingUnescapedAmpersand() {
    // Only the final ampersand is unescaped; the first one is an entity.
    $this->assertParseData('&amp;&', '&&');
}
/**
 * A bare ampersand surrounded by spaces is expected back unchanged.
 */
public function test_parseData_internalUnescapedAmpersand() {
    $text = 'Procter & Gamble';
    $this->assertParseData($text);
}
/**
 * A numeric reference without its terminating semicolon is expected back
 * unchanged.
 */
public function test_parseData_improperEntityFaultToleranceTest() {
    // A plain '-' contains no entity at all, so the input is written as a
    // semicolon-less reference to that character.
    // NOTE(review): the hexadecimal spelling is an assumption -- confirm
    // against the upstream test.
    $this->assertParseData('&#x2D');
}
95 // HTMLPurifier_Lexer->extractBody() ---------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() maps $text to $extract.
 *
 * @param string      $text    markup handed to extractBody()
 * @param string|bool $extract expected result; the default of true means
 *                             the input is expected back unchanged
 */
public function assertExtractBody($text, $extract = true) {
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->extractBody($text);
    $expected = ($extract === true) ? $text : $extract;
    $this->assertIdentical($expected, $actual);
}
/**
 * Markup without body tags is expected back unchanged.
 */
public function test_extractBody_noBodyTags() {
    $fragment = '<b>Bold</b>';
    $this->assertExtractBody($fragment);
}
/**
 * Only the content between lowercase body tags is expected back.
 */
public function test_extractBody_lowercaseBodyTags() {
    $document = '<html><body><b>Bold</b></body></html>';
    $this->assertExtractBody($document, '<b>Bold</b>');
}
/**
 * Only the content between uppercase BODY tags is expected back, with its
 * own case left as written.
 */
public function test_extractBody_uppercaseBodyTags() {
    $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $this->assertExtractBody($document, '<B>Bold</B>');
}
116 function test_extractBody_realisticUseCase() {
117 $this->assertExtractBody(
119 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
120 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
121 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
126 <form method="post" action="whatever1">
128 <input type="text" name="username" />
129 <input type="text" name="password" />
130 <input type="submit" />
136 <form method="post" action="whatever1">
138 <input type="text" name="username" />
139 <input type="text" name="password" />
140 <input type="submit" />
/**
 * A body tag carrying attributes is still recognised; only its content is
 * expected back.
 */
public function test_extractBody_bodyWithAttributes() {
    $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $this->assertExtractBody($document, '<b>Bold</b>');
}
/**
 * An opening body tag with no closing tag is expected back unchanged.
 */
public function test_extractBody_preserveUnclosedBody() {
    // not closed, don't accept
    $unclosed = '<body>asdf';
    $this->assertExtractBody($unclosed);
}
/**
 * With two closing body tags, the extracted content is expected to run up
 * to the last one.
 */
public function test_extractBody_useLastBody() {
    $document = '<body>foo</body>bar</body>';
    $this->assertExtractBody($document, 'foo</body>bar');
}
158 // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
/**
 * Tokenizes $input with every available lexer and checks each result.
 *
 * DirectLex is always exercised; PEARSax3 only when $_has_pear is set;
 * DOMLex and PH5P only when the DOMDocument class exists.
 *
 * @param string $input      HTML handed to each lexer's tokenizeHTML()
 * @param array  $expect     tokens expected from lexers without an override
 * @param array  $alt_expect per-lexer overrides keyed by lexer name
 *                           ('DirectLex', 'PEARSax3', 'DOMLex', 'PH5P');
 *                           a value of false skips that lexer entirely
 */
public function assertTokenization($input, $expect, $alt_expect = array()) {
    $lexers = array();
    $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
    if ($this->_has_pear) {
        $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
    }
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
    }
    foreach ($lexers as $name => $lexer) {
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
        // Pick exactly one expectation per lexer: the override when one is
        // given, the shared default otherwise.
        if (isset($alt_expect[$name])) {
            if ($alt_expect[$name] === false) continue;
            $t_expect = $alt_expect[$name];
        } else {
            $t_expect = $expect;
        }
        $this->assertIdentical($result, $t_expect, "$name: %s");
        // Dump the produced tokens when they differ from the expectation.
        if ($t_expect != $result) {
            printTokens($result);
        }
    }
}
/**
 * An empty string is expected to produce no tokens.
 */
public function test_tokenizeHTML_emptyInput() {
    $noTokens = array();
    $this->assertTokenization('', $noTokens);
}
/**
 * Plain text is expected to produce a single text token.
 */
public function test_tokenizeHTML_plainText() {
    // The expectation must be an array of tokens, not a bare token.
    $this->assertTokenization(
        'This is regular text.',
        array(
            new HTMLPurifier_Token_Text('This is regular text.')
        )
    );
}
/**
 * Text interleaved with a simple element is expected to produce
 * text / start / text / end / text tokens in document order.
 */
public function test_tokenizeHTML_textAndTags() {
    // The expected tokens are passed as one array argument.
    $this->assertTokenization(
        'This is <b>bold</b> text',
        array(
            new HTMLPurifier_Token_Text('This is '),
            new HTMLPurifier_Token_Start('b', array()),
            new HTMLPurifier_Token_Text('bold'),
            new HTMLPurifier_Token_End('b'),
            new HTMLPurifier_Token_Text(' text'),
        )
    );
}
/**
 * Tokenizes markup whose opening and closing div tags are written in
 * different case.
 */
public function test_tokenizeHTML_normalizeCase() {
    // The expected tokens are passed as one array argument.
    $this->assertTokenization(
        '<DIV>Totally rad dude. <b>asdf</b></div>',
        array(
            new HTMLPurifier_Token_Start('DIV', array()),
            new HTMLPurifier_Token_Text('Totally rad dude. '),
            new HTMLPurifier_Token_Start('b', array()),
            new HTMLPurifier_Token_Text('asdf'),
            new HTMLPurifier_Token_End('b'),
            new HTMLPurifier_Token_End('div'),
        )
    );
}
225 function test_tokenizeHTML_notWellFormed() {
226 $this->assertTokenization(
227 '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
229 new HTMLPurifier_Token_Start('asdf'),
230 new HTMLPurifier_Token_End('asdf'),
231 new HTMLPurifier_Token_Start('d'),
232 new HTMLPurifier_Token_End('d'),
233 new HTMLPurifier_Token_Start('poOloka'),
234 new HTMLPurifier_Token_Start('poolasdf'),
235 new HTMLPurifier_Token_Start('ds'),
236 new HTMLPurifier_Token_End('asdf'),
237 new HTMLPurifier_Token_End('ASDF'),
240 'DOMLex' => $alt = array(
241 new HTMLPurifier_Token_Empty('asdf'),
242 new HTMLPurifier_Token_Empty('d'),
243 new HTMLPurifier_Token_Start('pooloka'),
244 new HTMLPurifier_Token_Start('poolasdf'),
245 new HTMLPurifier_Token_Empty('ds'),
246 new HTMLPurifier_Token_End('poolasdf'),
247 new HTMLPurifier_Token_End('pooloka'),
/**
 * A tab and a newline between a tag name and its attributes are expected
 * to be treated as ordinary attribute separators.
 */
public function test_tokenizeHTML_whitespaceInTag() {
    // The expected tokens are passed as one array argument.
    $this->assertTokenization(
        '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
        array(
            new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
            new HTMLPurifier_Token_Text('Link to '),
            new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
            new HTMLPurifier_Token_Text('foobar'),
            new HTMLPurifier_Token_End('b'),
            new HTMLPurifier_Token_End('a'),
        )
    );
}
268 function test_tokenizeHTML_emptyTag() {
269 $this->assertTokenization(
271 array( new HTMLPurifier_Token_Empty('br') )
275 function test_tokenizeHTML_comment() {
276 $this->assertTokenization(
278 array( new HTMLPurifier_Token_Comment(' Comment ') ),
280 'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
/**
 * A comment closed with an extra dash: by default the extra dash is
 * expected to stay in the comment data; PEARSax3 has its own expectation.
 */
public function test_tokenizeHTML_malformedComment() {
    $this->assertTokenization(
        '<!-- not so well formed --->',
        array( new HTMLPurifier_Token_Comment(' not so well formed -') ),
        // Overrides are keyed by lexer name, so they must be wrapped in
        // an array to form the third argument.
        array(
            'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
        )
    );
}
295 function test_tokenizeHTML_unterminatedTag() {
296 $this->assertTokenization(
298 array( new HTMLPurifier_Token_Text('<a href=""') ),
300 // I like our behavior better, but it's non-standard
301 'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
302 'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
303 'PH5P' => false, // total barfing, grabs scaffolding too
308 function test_tokenizeHTML_specialEntities() {
309 $this->assertTokenization(
312 new HTMLPurifier_Token_Text('<b>')
315 // some parsers will separate entities out
316 'PEARSax3' => $split = array(
317 new HTMLPurifier_Token_Text('<'),
318 new HTMLPurifier_Token_Text('b'),
319 new HTMLPurifier_Token_Text('>'),
326 function test_tokenizeHTML_earlyQuote() {
327 $this->assertTokenization(
329 array( new HTMLPurifier_Token_Empty('a') ),
331 // we barf on this input
332 'DirectLex' => $tokens = array(
333 new HTMLPurifier_Token_Start('a', array('"' => ''))
335 'PEARSax3' => $tokens,
336 'PH5P' => false, // behavior varies; handle this personally
/**
 * Runs PH5P alone on a tag with a stray quote. A start token is expected
 * when the context's 'PH5PError' value is truthy, an empty token otherwise.
 *
 * Does nothing when the DOMDocument class is unavailable.
 */
public function test_tokenizeHTML_earlyQuote_PH5P() {
    if (!class_exists('DOMDocument')) return;
    $lexer = new HTMLPurifier_Lexer_PH5P();
    $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
    // The two expectations are alternatives, so they need an if/else.
    // NOTE(review): $result as the compared value is reconstructed (it is
    // the only candidate in scope) -- confirm against the upstream test.
    if ($this->context->get('PH5PError', true)) {
        $this->assertIdentical(array(
            new HTMLPurifier_Token_Start('a', array('"' => ''))
        ), $result);
    } else {
        $this->assertIdentical(array(
            new HTMLPurifier_Token_Empty('a', array('"' => ''))
        ), $result);
    }
}
356 function test_tokenizeHTML_unescapedQuote() {
357 $this->assertTokenization(
359 array( new HTMLPurifier_Token_Text('"') )
363 function test_tokenizeHTML_escapedQuote() {
364 $this->assertTokenization(
366 array( new HTMLPurifier_Token_Text('"') ),
368 'PEARSax3' => false, // PEAR barfs on this
373 function test_tokenizeHTML_cdata() {
374 $this->assertTokenization(
375 '<![CDATA[You <b>can't</b> get me!]]>',
376 array( new HTMLPurifier_Token_Text('You <b>can't</b> get me!') ),
378 // PEAR splits up all of the CDATA
379 'PEARSax3' => $split = array(
380 new HTMLPurifier_Token_Text('You '),
381 new HTMLPurifier_Token_Text('<'),
382 new HTMLPurifier_Token_Text('b'),
383 new HTMLPurifier_Token_Text('>'),
384 new HTMLPurifier_Token_Text('can'),
385 new HTMLPurifier_Token_Text('&'),
386 new HTMLPurifier_Token_Text('#39;t'),
387 new HTMLPurifier_Token_Text('<'),
388 new HTMLPurifier_Token_Text('/b'),
389 new HTMLPurifier_Token_Text('>'),
390 new HTMLPurifier_Token_Text(' get me!'),
397 function test_tokenizeHTML_characterEntity() {
398 $this->assertTokenization(
400 array( new HTMLPurifier_Token_Text("\xCE\xB8") )
404 function test_tokenizeHTML_characterEntityInCDATA() {
405 $this->assertTokenization(
406 '<![CDATA[→]]>',
407 array( new HTMLPurifier_Token_Text("→") ),
409 'PEARSax3' => $split = array(
410 new HTMLPurifier_Token_Text('&'),
411 new HTMLPurifier_Token_Text('rarr;'),
/**
 * An ampersand entity inside an attribute value is expected to be decoded
 * in the resulting token's attribute.
 */
public function test_tokenizeHTML_entityInAttribute() {
    // The markup must carry the escaped '&amp;'; with a bare '&' the test
    // would not involve an entity at all.
    $this->assertTokenization(
        '<a href="index.php?title=foo&amp;id=bar">Link</a>',
        array(
            new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
            new HTMLPurifier_Token_Text('Link'),
            new HTMLPurifier_Token_End('a'),
        )
    );
}
429 function test_tokenizeHTML_preserveUTF8() {
430 $this->assertTokenization(
432 array( new HTMLPurifier_Token_Text("\xCE\xB8") )
/**
 * A less-than entity inside an attribute value is expected to be decoded
 * in the resulting token's attribute.
 */
public function test_tokenizeHTML_specialEntityInAttribute() {
    // The markup must carry the escaped '&lt;'; with a bare '<' the test
    // would not involve an entity at all.
    $this->assertTokenization(
        '<br test="x &lt; 6" />',
        array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
    );
}
443 function test_tokenizeHTML_emoticonProtection() {
444 $this->assertTokenization(
445 '<b>Whoa! <3 That\'s not good >.></b>',
447 new HTMLPurifier_Token_Start('b'),
448 new HTMLPurifier_Token_Text('Whoa! '),
449 new HTMLPurifier_Token_Text('<'),
450 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
451 new HTMLPurifier_Token_End('b')
454 // text is absorbed together
456 new HTMLPurifier_Token_Start('b'),
457 new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
458 new HTMLPurifier_Token_End('b'),
460 'PEARSax3' => false, // totally mangled
461 'PH5P' => array( // interesting grouping
462 new HTMLPurifier_Token_Start('b'),
463 new HTMLPurifier_Token_Text('Whoa! '),
464 new HTMLPurifier_Token_Text('<'),
465 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
466 new HTMLPurifier_Token_End('b'),
472 function test_tokenizeHTML_commentWithFunkyChars() {
473 $this->assertTokenization(
474 '<!-- This >< comment --><br />',
476 new HTMLPurifier_Token_Comment(' This >< comment '),
477 new HTMLPurifier_Token_Empty('br'),
485 function test_tokenizeHTML_unterminatedComment() {
486 $this->assertTokenization(
487 '<!-- This >< comment',
488 array( new HTMLPurifier_Token_Comment(' This >< comment') ),
497 function test_tokenizeHTML_scriptCDATAContents() {
498 $this->config
->set('HTML.Trusted', true);
499 $this->assertTokenization(
500 'Foo: <script>alert("<foo>");</script>',
502 new HTMLPurifier_Token_Text('Foo: '),
503 new HTMLPurifier_Token_Start('script'),
504 new HTMLPurifier_Token_Text('alert("<foo>");'),
505 new HTMLPurifier_Token_End('script'),
509 // PH5P, for some reason, bubbles the script to <head>
515 function test_tokenizeHTML_entitiesInComment() {
516 $this->assertTokenization(
517 '<!-- This comment < < & -->',
518 array( new HTMLPurifier_Token_Comment(' This comment < < & ') ),
525 function test_tokenizeHTML_attributeWithSpecialCharacters() {
526 $this->assertTokenization(
528 array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
530 'DirectLex' => array(
531 new HTMLPurifier_Token_Start('a', array('href' => '')),
532 new HTMLPurifier_Token_Text('<'),
533 new HTMLPurifier_Token_Text('">'),
/**
 * Slashes inside an attribute value are expected not to interfere with
 * recognising the self-closing tag as one empty token.
 */
public function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
    $markup = '<param name="src" value="http://example.com/video.wmv" />';
    $expected = array(
        new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv'))
    );
    $this->assertTokenization($markup, $expected);
}
547 function test_tokenizeHTML_style() {
549 // PH5P doesn't seem to like style tags
551 // DirectLex defers to RemoveForeignElements for textification
552 'DirectLex' => array(
553 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
554 new HTMLPurifier_Token_Comment("\ndiv {}\n"),
555 new HTMLPurifier_Token_End('style'),
558 if (!defined('LIBXML_VERSION')) {
559 // LIBXML_VERSION is missing in early versions of PHP
560 // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
561 // this translates to 5.0.x. In such cases, punt the test entirely.
563 } elseif (LIBXML_VERSION
< 20628) {
564 // libxml's behavior is wrong prior to this version, so make
565 // appropriate accomodations
566 $extra['DOMLex'] = $extra['DirectLex'];
568 $this->assertTokenization(
569 '<style type="text/css"><!--
573 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
574 new HTMLPurifier_Token_Text("\ndiv {}\n"),
575 new HTMLPurifier_Token_End('style'),
581 function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
582 $this->assertTokenization(
585 new HTMLPurifier_Token_Start('a'),
586 new HTMLPurifier_Token_Text('>'),
587 new HTMLPurifier_Token_End('a'),
590 'DirectLex' => array(
591 // Technically this is invalid, but it won't be a
592 // problem with invalid element removal; also, this
593 // mimics Mozilla's parsing of the tag.
594 new HTMLPurifier_Token_Start('a@'),
595 new HTMLPurifier_Token_Text('>'),
601 function test_tokenizeHTML_emoticonHeart() {
602 $this->assertTokenization(
605 new HTMLPurifier_Token_Empty('br'),
606 new HTMLPurifier_Token_Text('<'),
607 new HTMLPurifier_Token_Text('3'),
608 new HTMLPurifier_Token_Empty('br'),
612 new HTMLPurifier_Token_Empty('br'),
613 new HTMLPurifier_Token_Text('<3'),
614 new HTMLPurifier_Token_Empty('br'),
620 function test_tokenizeHTML_emoticonShiftyEyes() {
621 $this->assertTokenization(
624 new HTMLPurifier_Token_Start('b'),
625 new HTMLPurifier_Token_Text('<'),
626 new HTMLPurifier_Token_Text('<'),
627 new HTMLPurifier_Token_End('b'),
631 new HTMLPurifier_Token_Start('b'),
632 new HTMLPurifier_Token_Text('<<'),
633 new HTMLPurifier_Token_End('b'),
639 function test_tokenizeHTML_eon1996() {
640 $this->assertTokenization(
643 new HTMLPurifier_Token_Text('<'),
644 new HTMLPurifier_Token_Text(' '),
645 new HTMLPurifier_Token_Start('b'),
646 new HTMLPurifier_Token_Text('test'),
647 new HTMLPurifier_Token_End('b'),
651 new HTMLPurifier_Token_Text('< '),
652 new HTMLPurifier_Token_Start('b'),
653 new HTMLPurifier_Token_Text('test'),
654 new HTMLPurifier_Token_End('b'),
660 function test_tokenizeHTML_bodyInCDATA() {
661 $this->assertTokenization(
662 '<![CDATA[<body>Foo</body>]]>',
664 new HTMLPurifier_Token_Text('<body>Foo</body>'),
668 new HTMLPurifier_Token_Text('<'),
669 new HTMLPurifier_Token_Text('body'),
670 new HTMLPurifier_Token_Text('>'),
671 new HTMLPurifier_Token_Text('Foo'),
672 new HTMLPurifier_Token_Text('<'),
673 new HTMLPurifier_Token_Text('/body'),
674 new HTMLPurifier_Token_Text('>'),
682 function test_tokenizeHTML_() {
683 $this->assertTokenization(
694 // vim: et sw=4 sts=4