Add vim modelines to all files.
[htmlpurifier.git] / tests / HTMLPurifier / LexerTest.php
blob 3d9c0309008c08c2fb898694124e44af195d2d10
1 <?php
3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
6 protected $_has_pear = false;
8 public function __construct() {
9 parent::__construct();
10 // E_STRICT = 2048, int used for PHP4 compat: this check disables
11 // PEAR if PHP 5 strict mode is on, since the class is not strict safe
12 if (
13 $GLOBALS['HTMLPurifierTest']['PEAR'] &&
14 ((error_reporting() & 2048) != 2048) // ought to be a better way
15 ) {
16 require_once 'HTMLPurifier/Lexer/PEARSax3.php';
17 $this->_has_pear = true;
19 if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
20 require_once 'HTMLPurifier/Lexer/PH5P.php';
24 // HTMLPurifier_Lexer::create() --------------------------------------------
/**
 * With Core.MaintainLineNumbers switched on, create() is expected to
 * return a DirectLex instance.
 */
public function test_create()
{
    $this->config->set('Core', 'MaintainLineNumbers', true);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * Core.LexerImpl may hold a lexer object; create() is expected to
 * return a DirectLex instance in that case.
 */
public function test_create_objectLexerImpl()
{
    $implementation = new HTMLPurifier_Lexer_DirectLex();
    $this->config->set('Core', 'LexerImpl', $implementation);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * An unrecognized Core.LexerImpl name is expected to make create()
 * throw an HTMLPurifier_Exception.
 */
public function test_create_unknownLexer()
{
    $this->config->set('Core', 'LexerImpl', 'AsdfAsdf');
    $anticipated = new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf');
    $this->expectException($anticipated);
    HTMLPurifier_Lexer::create($this->config);
}
/**
 * Requesting DOMLex together with Core.MaintainLineNumbers is expected
 * to make create() throw an HTMLPurifier_Exception.
 */
public function test_create_incompatibleLexer()
{
    $this->config->set('Core', 'LexerImpl', 'DOMLex');
    $this->config->set('Core', 'MaintainLineNumbers', true);
    $anticipated = new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
    $this->expectException($anticipated);
    HTMLPurifier_Lexer::create($this->config);
}
51 // HTMLPurifier_Lexer->parseData() -----------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::parseData() maps $input to $expect.
 *
 * @param string      $input  Text handed to parseData().
 * @param string|bool $expect Expected result; literal true means
 *                            "same as $input".
 */
public function assertParseData($input, $expect = true)
{
    $wanted = ($expect === true) ? $input : $expect;
    $lexer = new HTMLPurifier_Lexer();
    $parsed = $lexer->parseData($input);
    $this->assertIdentical($wanted, $parsed);
}
/**
 * Plain text is expected to come back from parseData() unchanged.
 */
public function test_parseData_plainText()
{
    $text = 'asdf';
    $this->assertParseData($text);
}
/**
 * '&amp;' is expected to become a bare ampersand.
 */
public function test_parseData_ampersandEntity()
{
    $escaped = '&amp;';
    $decoded = '&';
    $this->assertParseData($escaped, $decoded);
}
/**
 * '&quot;' is expected to become a double quote.
 */
public function test_parseData_quotEntity()
{
    $escaped = '&quot;';
    $decoded = '"';
    $this->assertParseData($escaped, $decoded);
}
/**
 * The zero-padded numeric entity '&#039;' is expected to become an apostrophe.
 */
public function test_parseData_aposNumericEntity()
{
    $escaped = '&#039;';
    $decoded = "'";
    $this->assertParseData($escaped, $decoded);
}
/**
 * The compact numeric entity '&#39;' is expected to become an apostrophe.
 */
public function test_parseData_aposCompactNumericEntity()
{
    $escaped = '&#39;';
    $decoded = "'";
    $this->assertParseData($escaped, $decoded);
}
/**
 * Three back-to-back '&amp;' entities are expected to become '&&&'.
 */
public function test_parseData_adjacentAmpersandEntities()
{
    $escaped = '&amp;&amp;&amp;';
    $decoded = '&&&';
    $this->assertParseData($escaped, $decoded);
}
/**
 * An entity followed by a raw ampersand is expected to become '&&'.
 */
public function test_parseData_trailingUnescapedAmpersand()
{
    $mixed = '&amp;&';
    $decoded = '&&';
    $this->assertParseData($mixed, $decoded);
}
/**
 * A raw ampersand inside ordinary text is expected to be left unchanged.
 */
public function test_parseData_internalUnescapedAmpersand()
{
    $text = 'Procter & Gamble';
    $this->assertParseData($text);
}
/**
 * The hexadecimal entity '&#x2D;' is expected to come back from
 * parseData() unchanged.
 */
public function test_parseData_improperEntityFaultToleranceTest()
{
    $text = '&#x2D;';
    $this->assertParseData($text);
}
95 // HTMLPurifier_Lexer->extractBody() ---------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() maps $text to $extract.
 *
 * @param string      $text    Markup handed to extractBody().
 * @param string|bool $extract Expected result; literal true means
 *                             "same as $text".
 */
public function assertExtractBody($text, $extract = true)
{
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->extractBody($text);
    $wanted = ($extract === true) ? $text : $extract;
    $this->assertIdentical($wanted, $actual);
}
/**
 * Markup without body tags is expected to be returned unchanged.
 */
public function test_extractBody_noBodyTags()
{
    $html = '<b>Bold</b>';
    $this->assertExtractBody($html);
}
/**
 * Content wrapped in lowercase html/body tags is expected to be extracted.
 */
public function test_extractBody_lowercaseBodyTags()
{
    $document = '<html><body><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * Content wrapped in uppercase HTML/BODY tags is expected to be extracted.
 */
public function test_extractBody_uppercaseBodyTags()
{
    $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $inner = '<B>Bold</B>';
    $this->assertExtractBody($document, $inner);
}
116 function test_extractBody_realisticUseCase() {
117 $this->assertExtractBody(
118 '<?xml version="1.0"
119 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
120 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
121 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
122 <head>
123 <title>xyz</title>
124 </head>
125 <body>
126 <form method="post" action="whatever1">
127 <div>
128 <input type="text" name="username" />
129 <input type="text" name="password" />
130 <input type="submit" />
131 </div>
132 </form>
133 </body>
134 </html>',
136 <form method="post" action="whatever1">
137 <div>
138 <input type="text" name="username" />
139 <input type="text" name="password" />
140 <input type="submit" />
141 </div>
142 </form>
/**
 * A body tag carrying attributes is still expected to be stripped away.
 */
public function test_extractBody_bodyWithAttributes()
{
    $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * An opening body tag with no closing tag is expected to be left in place.
 */
public function test_extractBody_preserveUnclosedBody()
{
    // not closed, don't accept
    $html = '<body>asdf';
    $this->assertExtractBody($html);
}
154 // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
/**
 * Tokenizes $input with each lexer this method instantiates and asserts
 * the resulting token array.
 *
 * DirectLex is always exercised; PEARSax3 only when $this->_has_pear is
 * set; DOMLex and PH5P only when the DOMDocument class exists.
 *
 * @param string $input      HTML handed to tokenizeHTML().
 * @param array  $expect     Token array expected from lexers that have no
 *                           entry in $alt_expect.
 * @param array  $alt_expect Per-lexer overrides keyed by lexer name
 *                           ('DirectLex', 'PEARSax3', 'DOMLex', 'PH5P').
 *                           A value of false skips the assertion for that
 *                           lexer (tokenizeHTML() is still invoked).
 */
public function assertTokenization($input, $expect, $alt_expect = array())
{
    $lexers = array();
    $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
    if ($this->_has_pear) {
        $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
    }
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
    }

    foreach ($lexers as $name => $lexer) {
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);

        if (isset($alt_expect[$name])) {
            if ($alt_expect[$name] === false) {
                continue;
            }
            $t_expect = $alt_expect[$name];
        } else {
            $t_expect = $expect;
        }

        // Expected value goes first, matching assertParseData() and
        // assertExtractBody() in this file.
        $this->assertIdentical($t_expect, $result, "$name: %s");

        // Debugging aid: dump the tokens actually produced on a mismatch.
        // NOTE(review): printTokens() is not defined in this file;
        // presumably a helper supplied by the test harness.
        if ($t_expect != $result) {
            printTokens($result);
            //var_dump($result);
        }
    }
}
/**
 * An empty string is expected to tokenize to an empty token array.
 */
public function test_tokenizeHTML_emptyInput()
{
    $html = '';
    $expected = array();
    $this->assertTokenization($html, $expected);
}
/**
 * Plain text is expected to tokenize to a single text token.
 */
public function test_tokenizeHTML_plainText()
{
    $html = 'This is regular text.';
    $expected = array(
        new HTMLPurifier_Token_Text('This is regular text.')
    );
    $this->assertTokenization($html, $expected);
}
/**
 * Text interleaved with a simple element is expected to tokenize into
 * text, start, text, end, text.
 */
public function test_tokenizeHTML_textAndTags()
{
    $html = 'This is <b>bold</b> text';
    $expected = array(
        new HTMLPurifier_Token_Text('This is '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('bold'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_Text(' text'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * Tokenizes markup whose opening tag is uppercase and whose closing
 * tag is lowercase.
 */
public function test_tokenizeHTML_normalizeCase()
{
    $html = '<DIV>Totally rad dude. <b>asdf</b></div>';
    $expected = array(
        new HTMLPurifier_Token_Start('DIV', array()),
        new HTMLPurifier_Token_Text('Totally rad dude. '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('asdf'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('div'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * Tokenizes badly nested markup. The default expectation mirrors the
 * tags as written; DOMLex and PH5P share a separate expectation.
 */
public function test_tokenizeHTML_notWellFormed()
{
    $html = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';

    $expected = array(
        new HTMLPurifier_Token_Start('asdf'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_Start('d'),
        new HTMLPurifier_Token_End('d'),
        new HTMLPurifier_Token_Start('poOloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Start('ds'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_End('ASDF'),
    );

    // Shared by DOMLex and PH5P
    $alt = array(
        new HTMLPurifier_Token_Empty('asdf'),
        new HTMLPurifier_Token_Empty('d'),
        new HTMLPurifier_Token_Start('pooloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Empty('ds'),
        new HTMLPurifier_Token_End('poolasdf'),
        new HTMLPurifier_Token_End('pooloka'),
    );

    $overrides = array(
        'DOMLex' => $alt,
        'PH5P' => $alt,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A tab and a newline between attributes are expected to act as
 * attribute separators.
 */
public function test_tokenizeHTML_whitespaceInTag()
{
    $html = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
    $expected = array(
        new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
        new HTMLPurifier_Token_Text('Link to '),
        new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
        new HTMLPurifier_Token_Text('foobar'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * A self-closing br is expected to tokenize to a single empty token.
 */
public function test_tokenizeHTML_emptyTag()
{
    $html = '<br />';
    $expected = array( new HTMLPurifier_Token_Empty('br') );
    $this->assertTokenization($html, $expected);
}
/**
 * A well-formed comment is expected to tokenize to one comment token;
 * PEARSax3 has its own expectation that keeps the dashes.
 */
public function test_tokenizeHTML_comment()
{
    $html = '<!-- Comment -->';
    $expected = array( new HTMLPurifier_Token_Comment(' Comment ') );
    $overrides = array(
        'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A comment closed with three dashes: the extra dash is expected to
 * stay in the comment data; PEARSax3 has its own expectation.
 */
public function test_tokenizeHTML_malformedComment()
{
    $html = '<!-- not so well formed --->';
    $expected = array( new HTMLPurifier_Token_Comment(' not so well formed -') );
    $overrides = array(
        'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A tag that is never closed with '>' is expected to be kept as text
 * by default; DOMLex and PEARSax3 have their own expectations and PH5P
 * is not asserted.
 */
public function test_tokenizeHTML_unterminatedTag()
{
    $html = '<a href=""';
    $expected = array( new HTMLPurifier_Token_Text('<a href=""') );
    $overrides = array(
        // I like our behavior better, but it's non-standard
        'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
        'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
        'PH5P' => false, // total barfing, grabs scaffolding too
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Escaped angle brackets are expected to decode into text; PEARSax3
 * and PH5P are expected to emit the pieces as separate text tokens.
 */
public function test_tokenizeHTML_specialEntities()
{
    $html = '&lt;b&gt;';
    $expected = array(
        new HTMLPurifier_Token_Text('<b>')
    );

    // some parsers will separate entities out
    $split = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
    );

    $overrides = array(
        'PEARSax3' => $split,
        'PH5P' => $split,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A stray quote where an attribute name should be. DirectLex and
 * PEARSax3 share an expectation; PH5P is not asserted here.
 */
public function test_tokenizeHTML_earlyQuote()
{
    $html = '<a "=>';
    $expected = array( new HTMLPurifier_Token_Empty('a') );

    // we barf on this input
    $tokens = array(
        new HTMLPurifier_Token_Start('a', array('"' => ''))
    );

    $overrides = array(
        'DirectLex' => $tokens,
        'PEARSax3' => $tokens,
        'PH5P' => false, // behavior varies; handle this personally
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * PH5P-specific variant of the early-quote test. The expected token
 * type depends on the 'PH5PError' context value read after tokenizing.
 * Does nothing when the DOMDocument class is unavailable.
 */
public function test_tokenizeHTML_earlyQuote_PH5P()
{
    if (!class_exists('DOMDocument')) {
        return;
    }

    $lexer = new HTMLPurifier_Lexer_PH5P();
    $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);

    $attributes = array('"' => '');
    if ($this->context->get('PH5PError', true)) {
        $token = new HTMLPurifier_Token_Start('a', $attributes);
    } else {
        $token = new HTMLPurifier_Token_Empty('a', $attributes);
    }

    $this->assertIdentical(array($token), $result);
}
/**
 * A lone double quote is expected to tokenize to a single text token.
 */
public function test_tokenizeHTML_unescapedQuote()
{
    $html = '"';
    $expected = array( new HTMLPurifier_Token_Text('"') );
    $this->assertTokenization($html, $expected);
}
/**
 * '&quot;' is expected to decode to a double-quote text token;
 * PEARSax3 is not asserted.
 */
public function test_tokenizeHTML_escapedQuote()
{
    $html = '&quot;';
    $expected = array( new HTMLPurifier_Token_Text('"') );
    $overrides = array(
        'PEARSax3' => false, // PEAR barfs on this
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * CDATA content is expected to come out as one literal text token;
 * PEARSax3 and PH5P share an expectation that splits it into pieces.
 */
public function test_tokenizeHTML_cdata()
{
    $html = '<![CDATA[You <b>can&#39;t</b> get me!]]>';
    $expected = array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') );

    // PEAR splits up all of the CDATA
    $split = array(
        new HTMLPurifier_Token_Text('You '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('can'),
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('#39;t'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text(' get me!'),
    );

    $overrides = array(
        'PEARSax3' => $split,
        'PH5P' => $split,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * '&theta;' is expected to decode to the two-byte sequence \xCE\xB8.
 */
public function test_tokenizeHTML_characterEntity()
{
    $html = '&theta;';
    $expected = array( new HTMLPurifier_Token_Text("\xCE\xB8") );
    $this->assertTokenization($html, $expected);
}
/**
 * An entity inside CDATA is expected to stay literal; PEARSax3 and
 * PH5P share an expectation that splits it at the ampersand.
 */
public function test_tokenizeHTML_characterEntityInCDATA()
{
    $html = '<![CDATA[&rarr;]]>';
    $expected = array( new HTMLPurifier_Token_Text("&rarr;") );

    $split = array(
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('rarr;'),
    );

    $overrides = array(
        'PEARSax3' => $split,
        'PH5P' => $split,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * '&amp;' inside an attribute value is expected to decode to '&'.
 */
public function test_tokenizeHTML_entityInAttribute()
{
    $html = '<a href="index.php?title=foo&amp;id=bar">Link</a>';
    $expected = array(
        new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
        new HTMLPurifier_Token_Text('Link'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * The raw byte sequence \xCE\xB8 is expected to pass through unchanged.
 */
public function test_tokenizeHTML_preserveUTF8()
{
    $html = "\xCE\xB8";
    $expected = array( new HTMLPurifier_Token_Text("\xCE\xB8") );
    $this->assertTokenization($html, $expected);
}
/**
 * '&lt;' inside an attribute value is expected to decode to '<'.
 */
public function test_tokenizeHTML_specialEntityInAttribute()
{
    $html = '<br test="x &lt; 6" />';
    $expected = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
    $this->assertTokenization($html, $expected);
}
/**
 * A '<' that does not start a tag is expected to be kept as text.
 * DOMLex and PH5P have their own expectations; PEARSax3 is not asserted.
 */
public function test_tokenizeHTML_emoticonProtection()
{
    $html = '<b>Whoa! <3 That\'s not good >.></b>';

    $expected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b')
    );

    // text is absorbed together
    $domExpected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    // interesting grouping
    $ph5pExpected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    $overrides = array(
        'DOMLex' => $domExpected,
        'PEARSax3' => false, // totally mangled
        'PH5P' => $ph5pExpected,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Angle brackets inside a comment are expected to stay in the comment
 * data; PEARSax3 is not asserted.
 */
public function test_tokenizeHTML_commentWithFunkyChars()
{
    $html = '<!-- This >< comment --><br />';
    $expected = array(
        new HTMLPurifier_Token_Comment(' This >< comment '),
        new HTMLPurifier_Token_Empty('br'),
    );
    $overrides = array(
        'PEARSax3' => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A comment that is never closed is expected to yield a comment token
 * holding the remaining text. Only the default expectation is checked;
 * DOMLex, PEARSax3 and PH5P are not asserted.
 */
public function test_tokenizeHTML_unterminatedComment()
{
    $html = '<!-- This >< comment';
    $expected = array( new HTMLPurifier_Token_Comment(' This >< comment') );
    $overrides = array(
        'DOMLex' => false,
        'PEARSax3' => false,
        'PH5P' => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * With HTML.Trusted enabled, script contents are expected to be kept
 * as one text token. PEARSax3 and PH5P are not asserted.
 */
public function test_tokenizeHTML_scriptCDATAContents()
{
    $this->config->set('HTML', 'Trusted', true);

    $html = 'Foo: <script>alert("<foo>");</script>';
    $expected = array(
        new HTMLPurifier_Token_Text('Foo: '),
        new HTMLPurifier_Token_Start('script'),
        new HTMLPurifier_Token_Text('alert("<foo>");'),
        new HTMLPurifier_Token_End('script'),
    );
    $overrides = array(
        'PEARSax3' => false,
        // PH5P, for some reason, bubbles the script to <head>
        'PH5P' => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Entities and raw special characters inside a comment are expected to
 * be left undecoded; PEARSax3 is not asserted.
 */
public function test_tokenizeHTML_entitiesInComment()
{
    $html = '<!-- This comment < &lt; & -->';
    $expected = array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') );
    $overrides = array(
        'PEARSax3' => false
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Angle brackets inside a quoted attribute value. DirectLex has its
 * own expectation; PEARSax3 is not asserted.
 */
public function test_tokenizeHTML_attributeWithSpecialCharacters()
{
    $html = '<a href="><>">';
    $expected = array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) );

    $directLexExpected = array(
        new HTMLPurifier_Token_Start('a', array('href' => '')),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('">'),
    );

    $overrides = array(
        'DirectLex' => $directLexExpected,
        'PEARSax3' => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Slashes inside an attribute value are expected not to interfere with
 * recognising the self-closing tag.
 */
public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
{
    $html = '<param name="src" value="http://example.com/video.wmv" />';
    $attributes = array('name' => 'src', 'value' => 'http://example.com/video.wmv');
    $expected = array( new HTMLPurifier_Token_Empty('param', $attributes) );
    $this->assertTokenization($html, $expected);
}
/**
 * A style element whose contents are wrapped in an HTML comment. The
 * default expectation is a text token; DirectLex expects a comment
 * token, and DOMLex is given the same expectation when LIBXML_VERSION
 * is undefined or below 20628. PH5P is not asserted.
 */
public function test_tokenizeHTML_style()
{
    // DirectLex defers to RemoveForeignElements for textification
    $commentForm = array(
        new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
        new HTMLPurifier_Token_Comment("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );

    $extra = array(
        // PH5P doesn't seem to like style tags
        'PH5P' => false,
        'DirectLex' => $commentForm,
    );

    // libxml's behavior is wrong prior to this version, so make
    // appropriate accommodations
    // :NOTE: LIBXML_VERSION is missing in early versions of PHP
    // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
    // this translates to 5.0.x). In such cases, we assume that an old
    // version of libxml is being used, although that *might* not
    // be the case (it's very unlikely though)
    $oldLibxml = !defined('LIBXML_VERSION') || LIBXML_VERSION < 20628;
    if ($oldLibxml) {
        $extra['DOMLex'] = $extra['DirectLex'];
    }

    $html = '<style type="text/css"><!--
div {}
--></style>';

    $expected = array(
        new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
        new HTMLPurifier_Token_Text("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );

    $this->assertTokenization($html, $expected, $extra);
}
/**
 * A tag name followed by '@' and an extra '>'. DirectLex has its own
 * expectation that keeps 'a@' as the tag name.
 */
public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
{
    $html = '<a@>>';

    $expected = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_End('a'),
    );

    // Technically this is invalid, but it won't be a
    // problem with invalid element removal; also, this
    // mimics Mozilla's parsing of the tag.
    $directLexExpected = array(
        new HTMLPurifier_Token_Start('a@'),
        new HTMLPurifier_Token_Text('>'),
    );

    $overrides = array(
        'DirectLex' => $directLexExpected,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * '<3' between two br tags is expected to be kept as text; DOMLex is
 * expected to merge it into one text token.
 */
public function test_tokenizeHTML_emoticonHeart()
{
    $html = '<br /><3<br />';

    $expected = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3'),
        new HTMLPurifier_Token_Empty('br'),
    );

    $domExpected = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<3'),
        new HTMLPurifier_Token_Empty('br'),
    );

    $overrides = array(
        'DOMLex' => $domExpected,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Two consecutive '<' characters inside an element are expected to be
 * kept as text; DOMLex is expected to merge them into one text token.
 */
public function test_tokenizeHTML_emoticonShiftyEyes()
{
    $html = '<b><<</b>';

    $expected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_End('b'),
    );

    $domExpected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<<'),
        new HTMLPurifier_Token_End('b'),
    );

    $overrides = array(
        'DOMLex' => $domExpected,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A '<' followed by a space is expected to be kept as text ahead of a
 * real tag; DOMLex is expected to merge '< ' into one text token.
 */
public function test_tokenizeHTML_eon1996()
{
    $html = '< <b>test</b>';

    $expected = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text(' '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );

    $domExpected = array(
        new HTMLPurifier_Token_Text('< '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );

    $overrides = array(
        'DOMLex' => $domExpected,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Body tags inside CDATA are expected to be kept as literal text; PH5P
 * has its own expectation that splits the text into pieces.
 */
public function test_tokenizeHTML_bodyInCDATA()
{
    $html = '<![CDATA[<body>Foo</body>]]>';

    $expected = array(
        new HTMLPurifier_Token_Text('<body>Foo</body>'),
    );

    $ph5pExpected = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('body'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('Foo'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/body'),
        new HTMLPurifier_Token_Text('>'),
    );

    $overrides = array(
        'PH5P' => $ph5pExpected,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
678 function test_tokenizeHTML_() {
679 $this->assertTokenization(
681 array(
690 // vim: et sw=4 sts=4