Support for inline-block.
[htmlpurifier.git] / tests / HTMLPurifier / LexerTest.php
blob42a59aeb6e4b5f7155db304738d82c7f4efd8587
1 <?php
3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
6 protected $_has_pear = false;
/**
 * Sets up the harness and, when the test configuration asks for it,
 * loads the PH5P lexer implementation.
 *
 * The PH5P flag is read with empty() so that a missing
 * $GLOBALS['HTMLPurifierTest'] entry (or a missing 'PH5P' key) is treated
 * as "disabled" instead of raising an undefined-index notice.
 */
public function __construct()
{
    parent::__construct();
    if (!empty($GLOBALS['HTMLPurifierTest']['PH5P'])) {
        require_once 'HTMLPurifier/Lexer/PH5P.php';
    }
}
15 // HTMLPurifier_Lexer::create() --------------------------------------------
/**
 * With Core.MaintainLineNumbers enabled, the factory must hand back a
 * DirectLex instance.
 */
public function test_create()
{
    $this->config->set('Core.MaintainLineNumbers', true);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * Core.LexerImpl may be given a lexer object; the factory result must
 * then be a DirectLex instance.
 */
public function test_create_objectLexerImpl()
{
    $impl = new HTMLPurifier_Lexer_DirectLex();
    $this->config->set('Core.LexerImpl', $impl);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * An unrecognized Core.LexerImpl name must make the factory throw an
 * HTMLPurifier_Exception with the message asserted below.
 */
public function test_create_unknownLexer()
{
    $this->config->set('Core.LexerImpl', 'AsdfAsdf');
    $expected = new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf');
    $this->expectException($expected);
    HTMLPurifier_Lexer::create($this->config);
}
/**
 * Requesting DOMLex together with Core.MaintainLineNumbers must make the
 * factory throw the line-number incompatibility exception.
 */
public function test_create_incompatibleLexer()
{
    $this->config->set('Core.LexerImpl', 'DOMLex');
    $this->config->set('Core.MaintainLineNumbers', true);
    $expected = new HTMLPurifier_Exception(
        'Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
    );
    $this->expectException($expected);
    HTMLPurifier_Lexer::create($this->config);
}
42 // HTMLPurifier_Lexer->parseData() -----------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::parseData() turns $input into $expect.
 *
 * @param string      $input  Raw character data handed to parseData().
 * @param string|bool $expect Expected result; the literal true means
 *                            "same as $input".
 */
public function assertParseData($input, $expect = true)
{
    $wanted = ($expect === true) ? $input : $expect;
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->parseData($input);
    $this->assertIdentical($wanted, $actual);
}
/**
 * Text without entities is expected back from parseData() unchanged.
 */
public function test_parseData_plainText()
{
    $text = 'asdf';
    $this->assertParseData($text);
}
/**
 * The &amp; entity is expected to decode to a bare ampersand.
 */
public function test_parseData_ampersandEntity()
{
    $encoded = '&amp;';
    $decoded = '&';
    $this->assertParseData($encoded, $decoded);
}
/**
 * The &quot; entity is expected to decode to a double quote.
 */
public function test_parseData_quotEntity()
{
    $encoded = '&quot;';
    $decoded = '"';
    $this->assertParseData($encoded, $decoded);
}
/**
 * The zero-padded numeric entity &#039; is expected to decode to an
 * apostrophe.
 */
public function test_parseData_aposNumericEntity()
{
    $encoded = '&#039;';
    $decoded = "'";
    $this->assertParseData($encoded, $decoded);
}
/**
 * The unpadded numeric entity &#39; is expected to decode to an
 * apostrophe.
 */
public function test_parseData_aposCompactNumericEntity()
{
    $encoded = '&#39;';
    $decoded = "'";
    $this->assertParseData($encoded, $decoded);
}
/**
 * Three back-to-back &amp; entities are expected to decode to '&&&'.
 */
public function test_parseData_adjacentAmpersandEntities()
{
    $encoded = '&amp;&amp;&amp;';
    $decoded = '&&&';
    $this->assertParseData($encoded, $decoded);
}
/**
 * An entity followed by a raw trailing ampersand is expected to yield
 * two ampersands.
 */
public function test_parseData_trailingUnescapedAmpersand()
{
    $encoded = '&amp;&';
    $decoded = '&&';
    $this->assertParseData($encoded, $decoded);
}
/**
 * A raw ampersand in the middle of text is expected to be left as is.
 */
public function test_parseData_internalUnescapedAmpersand()
{
    $text = 'Procter & Gamble';
    $this->assertParseData($text);
}
/**
 * parseData() is expected to return the string '&#x2D;' unchanged.
 */
public function test_parseData_improperEntityFaultToleranceTest()
{
    $text = '&#x2D;';
    $this->assertParseData($text);
}
86 // HTMLPurifier_Lexer->extractBody() ---------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() reduces $text to $extract.
 *
 * @param string      $text    Markup handed to extractBody().
 * @param string|bool $extract Expected result; the literal true means
 *                             "same as $text".
 */
public function assertExtractBody($text, $extract = true)
{
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->extractBody($text);
    $wanted = ($extract === true) ? $text : $extract;
    $this->assertIdentical($wanted, $actual);
}
/**
 * Markup without any body tags is expected back unchanged.
 */
public function test_extractBody_noBodyTags()
{
    $markup = '<b>Bold</b>';
    $this->assertExtractBody($markup);
}
/**
 * Content wrapped in lowercase html/body tags is expected to be
 * extracted without the wrappers.
 */
public function test_extractBody_lowercaseBodyTags()
{
    $document = '<html><body><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * Content wrapped in uppercase HTML/BODY tags is expected to be
 * extracted without the wrappers.
 */
public function test_extractBody_uppercaseBodyTags()
{
    $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $inner = '<B>Bold</B>';
    $this->assertExtractBody($document, $inner);
}
/**
 * A full XHTML document (XML prolog, doctype, head, body) is expected to
 * be reduced to exactly what sits between <body> and </body>.
 *
 * NOTE(review): any leading whitespace the literals originally carried is
 * not visible in this view; the document is assembled around $inner so the
 * input and the expectation stay consistent with each other.
 */
public function test_extractBody_realisticUseCase()
{
    // Everything between the opening and closing body tags.
    $inner = '
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
';
    $document = '<?xml version="1.0"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>xyz</title>
</head>
<body>' . $inner . '</body>
</html>';
    $this->assertExtractBody($document, $inner);
}
/**
 * A body tag carrying attributes is still expected to be stripped,
 * leaving only its content.
 */
public function test_extractBody_bodyWithAttributes()
{
    $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * A body tag that is never closed must not be treated as a body wrapper:
 * the input is expected back unchanged.
 */
public function test_extractBody_preserveUnclosedBody()
{
    $markup = '<body>asdf';
    $this->assertExtractBody($markup);
}
/**
 * When several closing body tags are present, extraction is expected to
 * run up to the last one.
 */
public function test_extractBody_useLastBody()
{
    $markup = '<body>foo</body>bar</body>';
    $inner = 'foo</body>bar';
    $this->assertExtractBody($markup, $inner);
}
149 // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
/**
 * Tokenizes $input with every available lexer and checks each result.
 *
 * DirectLex is always exercised; DOMLex and PH5P are added when the
 * DOMDocument class exists. When an assertion's expectation differs from
 * the result (loose comparison), the tokens are dumped via printTokens().
 *
 * @param string $input      HTML to tokenize.
 * @param array  $expect     Token list expected from lexers that have no
 *                           entry in $alt_expect.
 * @param array  $alt_expect Per-lexer overrides keyed by lexer name; a
 *                           value of false skips the check for that lexer.
 */
public function assertTokenization($input, $expect, $alt_expect = array())
{
    $lexers = array('DirectLex' => new HTMLPurifier_Lexer_DirectLex());
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
    }

    foreach ($lexers as $name => $lexer) {
        // Tokenization runs even for skipped lexers, as it always has.
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);

        $hasOverride = isset($alt_expect[$name]);
        if ($hasOverride && $alt_expect[$name] === false) {
            continue;
        }

        $t_expect = $hasOverride ? $alt_expect[$name] : $expect;
        $this->assertIdentical($result, $t_expect, "$name: %s");

        // Loose comparison on purpose: strict array comparison would
        // compare the token objects by identity.
        if ($t_expect != $result) {
            printTokens($result);
        }
    }
}
/**
 * An empty string is expected to produce no tokens.
 */
public function test_tokenizeHTML_emptyInput()
{
    $html = '';
    $expected = array();
    $this->assertTokenization($html, $expected);
}
/**
 * Plain text is expected to produce a single text token.
 */
public function test_tokenizeHTML_plainText()
{
    $html = 'This is regular text.';
    $expected = array(
        new HTMLPurifier_Token_Text('This is regular text.'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * Text mixed with a simple inline element is expected to produce
 * text / start / text / end / text tokens in order.
 */
public function test_tokenizeHTML_textAndTags()
{
    $html = 'This is <b>bold</b> text';
    $expected = array(
        new HTMLPurifier_Token_Text('This is '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('bold'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_Text(' text'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * Tokenizes markup whose opening tag is uppercase (<DIV>) and whose
 * closing tag is lowercase (</div>).
 */
public function test_tokenizeHTML_normalizeCase()
{
    $html = '<DIV>Totally rad dude. <b>asdf</b></div>';
    $expected = array(
        new HTMLPurifier_Token_Start('DIV', array()),
        new HTMLPurifier_Token_Text('Totally rad dude. '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('asdf'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('div'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * Badly nested, unknown tags: the default expectation keeps the tags as
 * written, while DOMLex and PH5P share a repaired, properly nested
 * token list.
 */
public function test_tokenizeHTML_notWellFormed()
{
    $html = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';
    $expected = array(
        new HTMLPurifier_Token_Start('asdf'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_Start('d'),
        new HTMLPurifier_Token_End('d'),
        new HTMLPurifier_Token_Start('poOloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Start('ds'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_End('ASDF'),
    );
    // Shared by both DOM-based lexers.
    $repaired = array(
        new HTMLPurifier_Token_Empty('asdf'),
        new HTMLPurifier_Token_Empty('d'),
        new HTMLPurifier_Token_Start('pooloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Empty('ds'),
        new HTMLPurifier_Token_End('poolasdf'),
        new HTMLPurifier_Token_End('pooloka'),
    );
    $overrides = array(
        'DOMLex' => $repaired,
        'PH5P' => $repaired,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A tab and a newline used as attribute separators inside a tag are
 * expected to be handled like ordinary whitespace.
 */
public function test_tokenizeHTML_whitespaceInTag()
{
    $html = '<a' . "\t" . 'href="foobar.php"' . "\n" . 'title="foo!">Link to <b id="asdf">foobar</b></a>';
    $expected = array(
        new HTMLPurifier_Token_Start('a', array('href' => 'foobar.php', 'title' => 'foo!')),
        new HTMLPurifier_Token_Text('Link to '),
        new HTMLPurifier_Token_Start('b', array('id' => 'asdf')),
        new HTMLPurifier_Token_Text('foobar'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * A self-closing tag with one attribute is expected to yield an empty
 * token whose attribute value has its entity decoded.
 */
public function test_tokenizeHTML_singleAttribute()
{
    $html = '<br style="&amp;" />';
    $expected = array(
        new HTMLPurifier_Token_Empty('br', array('style' => '&')),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * A self-closing <br /> is expected to yield a single empty token.
 */
public function test_tokenizeHTML_emptyTag()
{
    $html = '<br />';
    $expected = array(new HTMLPurifier_Token_Empty('br'));
    $this->assertTokenization($html, $expected);
}
/**
 * An HTML comment is expected to yield a comment token holding the text
 * between the delimiters, surrounding spaces included.
 */
public function test_tokenizeHTML_comment()
{
    $html = '<!-- Comment -->';
    $expected = array(new HTMLPurifier_Token_Comment(' Comment '));
    $this->assertTokenization($html, $expected);
}
/**
 * A comment closed with three dashes is expected to keep the extra dash
 * as part of the comment text.
 */
public function test_tokenizeHTML_malformedComment()
{
    $html = '<!-- not so well formed --->';
    $expected = array(new HTMLPurifier_Token_Comment(' not so well formed -'));
    $this->assertTokenization($html, $expected);
}
/**
 * A tag that is never closed with '>': by default it is expected back as
 * literal text; DOMLex instead produces an empty 'a' token, and PH5P is
 * skipped.
 */
public function test_tokenizeHTML_unterminatedTag()
{
    $html = '<a href=""';
    $expected = array(new HTMLPurifier_Token_Text('<a href=""'));
    $overrides = array(
        // The default behavior is preferred here, though it is non-standard.
        'DOMLex' => array(new HTMLPurifier_Token_Empty('a', array('href' => ''))),
        // PH5P fails badly on this input and pulls in scaffolding as well.
        'PH5P' => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Escaped angle brackets are expected to decode into literal text; PH5P
 * is expected to split the result into three text tokens.
 */
public function test_tokenizeHTML_specialEntities()
{
    $html = '&lt;b&gt;';
    $expected = array(
        new HTMLPurifier_Token_Text('<b>'),
    );
    $overrides = array(
        // This parser emits each decoded entity as its own token.
        'PH5P' => array(
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('b'),
            new HTMLPurifier_Token_Text('>'),
        ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A stray quote where an attribute name should be: the default
 * expectation is an empty 'a' token, DirectLex is expected to produce a
 * start token with a '"' attribute, and PH5P is skipped here.
 */
public function test_tokenizeHTML_earlyQuote()
{
    $html = '<a "=>';
    $expected = array(new HTMLPurifier_Token_Empty('a'));
    $overrides = array(
        // DirectLex does not cope well with this input.
        'DirectLex' => array(
            new HTMLPurifier_Token_Start('a', array('"' => '')),
        ),
        // PH5P's behavior varies, so it is covered by a dedicated test.
        'PH5P' => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * PH5P-specific check for the stray-quote input. Does nothing when
 * DOMDocument is unavailable. The expected token is a start token when
 * the context's 'PH5PError' value is truthy and an empty token otherwise.
 */
public function test_tokenizeHTML_earlyQuote_PH5P()
{
    if (!class_exists('DOMDocument')) {
        return;
    }
    $ph5p = new HTMLPurifier_Lexer_PH5P();
    $result = $ph5p->tokenizeHTML('<a "=>', $this->config, $this->context);

    $attr = array('"' => '');
    $token = $this->context->get('PH5PError', true)
        ? new HTMLPurifier_Token_Start('a', $attr)
        : new HTMLPurifier_Token_Empty('a', $attr);

    $this->assertIdentical(array($token), $result);
}
/**
 * A bare double quote is expected to yield a text token holding it.
 */
public function test_tokenizeHTML_unescapedQuote()
{
    $html = '"';
    $expected = array(new HTMLPurifier_Token_Text('"'));
    $this->assertTokenization($html, $expected);
}
/**
 * The &quot; entity is expected to yield a text token holding a double
 * quote.
 */
public function test_tokenizeHTML_escapedQuote()
{
    $html = '&quot;';
    $expected = array(new HTMLPurifier_Token_Text('"'));
    $this->assertTokenization($html, $expected);
}
/**
 * CDATA content is expected to come through as literal text with markup
 * and entities untouched; PH5P is expected to split it into many text
 * tokens.
 */
public function test_tokenizeHTML_cdata()
{
    $html = '<![CDATA[You <b>can&#39;t</b> get me!]]>';
    $expected = array(
        new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!'),
    );
    $overrides = array(
        'PH5P' => array(
            new HTMLPurifier_Token_Text('You '),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('b'),
            new HTMLPurifier_Token_Text('>'),
            new HTMLPurifier_Token_Text('can'),
            new HTMLPurifier_Token_Text('&'),
            new HTMLPurifier_Token_Text('#39;t'),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('/b'),
            new HTMLPurifier_Token_Text('>'),
            new HTMLPurifier_Token_Text(' get me!'),
        ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * The named entity &theta; is expected to decode to the two-byte
 * sequence CE B8.
 */
public function test_tokenizeHTML_characterEntity()
{
    $html = '&theta;';
    $expected = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
    $this->assertTokenization($html, $expected);
}
/**
 * An entity inside CDATA is expected to stay undecoded; PH5P is expected
 * to split it at the ampersand.
 */
public function test_tokenizeHTML_characterEntityInCDATA()
{
    $html = '<![CDATA[&rarr;]]>';
    $expected = array(new HTMLPurifier_Token_Text("&rarr;"));
    $overrides = array(
        'PH5P' => array(
            new HTMLPurifier_Token_Text('&'),
            new HTMLPurifier_Token_Text('rarr;'),
        ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * An &amp; inside an attribute value is expected to be decoded in the
 * resulting start token.
 */
public function test_tokenizeHTML_entityInAttribute()
{
    $html = '<a href="index.php?title=foo&amp;id=bar">Link</a>';
    $expected = array(
        new HTMLPurifier_Token_Start('a', array('href' => 'index.php?title=foo&id=bar')),
        new HTMLPurifier_Token_Text('Link'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * The raw byte sequence CE B8 is expected to pass through tokenization
 * unchanged.
 */
public function test_tokenizeHTML_preserveUTF8()
{
    $html = "\xCE\xB8";
    $expected = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
    $this->assertTokenization($html, $expected);
}
/**
 * An &lt; inside an attribute value is expected to be decoded to '<'.
 */
public function test_tokenizeHTML_specialEntityInAttribute()
{
    $html = '<br test="x &lt; 6" />';
    $expected = array(
        new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * Stray '<' and '>' used as emoticons must not be read as tags. The
 * default and PH5P expectations split the text at the '<'; DOMLex is
 * expected to keep it as one text token.
 */
public function test_tokenizeHTML_emoticonProtection()
{
    $html = '<b>Whoa! <3 That\'s not good >.></b>';
    $expected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );
    $overrides = array(
        // DOMLex merges the whole run of text into one token.
        'DOMLex' => array(
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
            new HTMLPurifier_Token_End('b'),
        ),
        // PH5P groups the text in the same three pieces as the default.
        'PH5P' => array(
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('Whoa! '),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
            new HTMLPurifier_Token_End('b'),
        ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Angle brackets inside a comment are expected to stay part of the
 * comment, with the following tag tokenized normally.
 */
public function test_tokenizeHTML_commentWithFunkyChars()
{
    $html = '<!-- This >< comment --><br />';
    $expected = array(
        new HTMLPurifier_Token_Comment(' This >< comment '),
        new HTMLPurifier_Token_Empty('br'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * A comment that is never closed is expected to yield a comment token
 * with the remaining text; DOMLex and PH5P are skipped.
 */
public function test_tokenizeHTML_unterminatedComment()
{
    $html = '<!-- This >< comment';
    $expected = array(new HTMLPurifier_Token_Comment(' This >< comment'));
    $overrides = array(
        'DOMLex' => false,
        'PH5P' => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * With HTML.Trusted enabled, script contents are expected to be kept as
 * one text token even when they contain tag-like text. PH5P is skipped.
 */
public function test_tokenizeHTML_scriptCDATAContents()
{
    $this->config->set('HTML.Trusted', true);
    $html = 'Foo: <script>alert("<foo>");</script>';
    $expected = array(
        new HTMLPurifier_Token_Text('Foo: '),
        new HTMLPurifier_Token_Start('script'),
        new HTMLPurifier_Token_Text('alert("<foo>");'),
        new HTMLPurifier_Token_End('script'),
    );
    $overrides = array(
        // For reasons unknown, PH5P moves the script up into <head>.
        'PH5P' => false,
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Entities and stray special characters inside a comment are expected to
 * be left exactly as written.
 */
public function test_tokenizeHTML_entitiesInComment()
{
    $html = '<!-- This comment < &lt; & -->';
    $expected = array(new HTMLPurifier_Token_Comment(' This comment < &lt; & '));
    $this->assertTokenization($html, $expected);
}
/**
 * Angle brackets inside a quoted attribute value: by default they are
 * expected to stay in the value; DirectLex is expected to end the tag at
 * the first '>' and emit the remainder as text.
 */
public function test_tokenizeHTML_attributeWithSpecialCharacters()
{
    $html = '<a href="><>">';
    $expected = array(
        new HTMLPurifier_Token_Empty('a', array('href' => '><>')),
    );
    $overrides = array(
        'DirectLex' => array(
            new HTMLPurifier_Token_Start('a', array('href' => '')),
            new HTMLPurifier_Token_Text('<'),
            new HTMLPurifier_Token_Text('">'),
        ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Slashes inside an attribute value are expected not to interfere with
 * recognizing the self-closing tag.
 */
public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
{
    $html = '<param name="src" value="http://example.com/video.wmv" />';
    $attr = array('name' => 'src', 'value' => 'http://example.com/video.wmv');
    $expected = array(new HTMLPurifier_Token_Empty('param', $attr));
    $this->assertTokenization($html, $expected);
}
/**
 * A style element whose contents are wrapped in comment delimiters. By
 * default the contents are expected as a text token; DirectLex is
 * expected to emit a comment token, and PH5P is skipped. The test is
 * abandoned entirely when LIBXML_VERSION is not defined.
 */
public function test_tokenizeHTML_style()
{
    // DirectLex leaves textification to RemoveForeignElements.
    $commentForm = array(
        new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
        new HTMLPurifier_Token_Comment("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );
    $extra = array(
        // PH5P does not appear to handle style tags well.
        'PH5P' => false,
        'DirectLex' => $commentForm,
    );

    if (!defined('LIBXML_VERSION')) {
        // Early PHP versions (5.0.x, before revision 1.30 of
        // php-src/ext/libxml/libxml.c) lack LIBXML_VERSION; give up on
        // the test in that case.
        return;
    }
    if (LIBXML_VERSION < 20628) {
        // libxml behaves incorrectly before this version, so expect the
        // same tokens from DOMLex as from DirectLex.
        $extra['DOMLex'] = $extra['DirectLex'];
    }

    $html = '<style type="text/css"><!--
div {}
--></style>';
    $expected = array(
        new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
        new HTMLPurifier_Token_Text("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );
    $this->assertTokenization($html, $expected, $extra);
}
/**
 * A tag name containing '@' followed by an extra '>': the default
 * expectation is a well-formed 'a' element around the '>', while
 * DirectLex is expected to keep 'a@' as the tag name.
 */
public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
{
    // Strictly speaking this is invalid, but invalid element removal
    // makes it harmless, and it matches how Mozilla parses the tag.
    $alt_expect = array(
        new HTMLPurifier_Token_Start('a@'),
        new HTMLPurifier_Token_Text('>'),
    );
    $html = '<a@>>';
    $expected = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $expected, array('DirectLex' => $alt_expect));
}
/**
 * A '<3' between two tags: by default it is expected as two text tokens
 * ('<' and '3'); DOMLex is expected to produce a single '<3' token.
 */
public function test_tokenizeHTML_emoticonHeart()
{
    $html = '<br /><3<br />';
    $expected = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3'),
        new HTMLPurifier_Token_Empty('br'),
    );
    $overrides = array(
        'DOMLex' => array(
            new HTMLPurifier_Token_Empty('br'),
            new HTMLPurifier_Token_Text('<3'),
            new HTMLPurifier_Token_Empty('br'),
        ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Two consecutive '<' inside an element: by default each is expected as
 * its own text token; DOMLex is expected to produce one '<<' token.
 */
public function test_tokenizeHTML_emoticonShiftyEyes()
{
    $html = '<b><<</b>';
    $expected = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_End('b'),
    );
    $overrides = array(
        'DOMLex' => array(
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('<<'),
            new HTMLPurifier_Token_End('b'),
        ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * A '<' followed by a space before a real tag: by default the '<' and
 * the space are expected as separate text tokens; DOMLex is expected to
 * combine them.
 */
public function test_tokenizeHTML_eon1996()
{
    $html = '< <b>test</b>';
    $expected = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text(' '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );
    $overrides = array(
        'DOMLex' => array(
            new HTMLPurifier_Token_Text('< '),
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('test'),
            new HTMLPurifier_Token_End('b'),
        ),
    );
    $this->assertTokenization($html, $expected, $overrides);
}
/**
 * Body tags inside CDATA are expected to be treated as literal text;
 * PH5P is expected to split that text into seven tokens.
 */
public function test_tokenizeHTML_bodyInCDATA()
{
    $alt_tokens = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('body'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('Foo'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/body'),
        new HTMLPurifier_Token_Text('>'),
    );
    $html = '<![CDATA[<body>Foo</body>]]>';
    $expected = array(
        new HTMLPurifier_Token_Text('<body>Foo</body>'),
    );
    $this->assertTokenization($html, $expected, array('PH5P' => $alt_tokens));
}
/**
 * A self-closing img nested in an anchor is expected to yield
 * start / empty / end tokens.
 */
public function test_tokenizeHTML_()
{
    $html = '<a><img /></a>';
    $expected = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Empty('img'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * An IE conditional comment, including everything inside it, is
 * expected to produce no tokens at all.
 */
public function test_tokenizeHTML_ignoreIECondComment()
{
    $html = '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->';
    $expected = array();
    $this->assertTokenization($html, $expected);
}
/**
 * With Core.RemoveProcessingInstructions enabled, a processing
 * instruction is expected to produce no tokens.
 */
public function test_tokenizeHTML_removeProcessingInstruction()
{
    $this->config->set('Core.RemoveProcessingInstructions', true);
    $html = '<?xml blah blah ?>';
    $expected = array();
    $this->assertTokenization($html, $expected);
}
/**
 * With Core.NormalizeNewlines enabled, CR and CRLF are expected to be
 * converted to LF.
 */
public function test_tokenizeHTML_removeNewline()
{
    $this->config->set('Core.NormalizeNewlines', true);
    $html = "plain\rtext\r\n";
    $expected = array(
        new HTMLPurifier_Token_Text("plain\ntext\n"),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * With Core.NormalizeNewlines disabled, CR and CRLF are expected to be
 * left untouched.
 */
public function test_tokenizeHTML_noRemoveNewline()
{
    $this->config->set('Core.NormalizeNewlines', false);
    $html = "plain\rtext\r\n";
    $expected = array(
        new HTMLPurifier_Token_Text("plain\rtext\r\n"),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * Two separate conditional comments must each be removed on their own:
 * the text between them is expected to survive as the only token.
 */
public function test_tokenizeHTML_conditionalCommentUngreedy()
{
    $html = '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->';
    $expected = array(
        new HTMLPurifier_Token_Text("b"),
    );
    $this->assertTokenization($html, $expected);
}
/**
 * An img tag without a trailing slash: the default expectation is an
 * empty token, while DirectLex is expected to emit a start token with
 * the same attributes.
 */
public function test_tokenizeHTML_imgTag()
{
    $start = array(
        new HTMLPurifier_Token_Start(
            'img',
            array(
                'src' => 'img_11775.jpg',
                'alt' => '[Img #11775]',
                'id' => 'EMBEDDED_IMG_11775',
            )
        ),
    );
    $html = '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >';
    $expected = array(
        new HTMLPurifier_Token_Empty(
            'img',
            array(
                'src' => 'img_11775.jpg',
                'alt' => '[Img #11775]',
                'id' => 'EMBEDDED_IMG_11775',
            )
        ),
    );
    $this->assertTokenization($html, $expected, array('DirectLex' => $start));
}
740 function test_tokenizeHTML_() {
741 $this->assertTokenization(
743 array(
752 // vim: et sw=4 sts=4