Don't truncate in DOMLex when seeing closing div
[htmlpurifier.git] / tests / HTMLPurifier / LexerTest.php
blobecdbe1b8dc341b7023c7de8d8263c48e9ec45a46
1 <?php
3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
6 protected $_has_pear = false;
/**
 * Runs the parent harness constructor and, when the global test
 * configuration flag for PH5P is truthy, loads the PH5P lexer file.
 */
public function __construct()
{
    parent::__construct();
    if (!$GLOBALS['HTMLPurifierTest']['PH5P']) {
        return;
    }
    require_once 'HTMLPurifier/Lexer/PH5P.php';
}
16 // HTMLPurifier_Lexer::create() --------------------------------------------
/**
 * With Core.MaintainLineNumbers enabled, the factory is expected to
 * hand back a DirectLex instance.
 */
public function test_create()
{
    $this->config->set('Core.MaintainLineNumbers', true);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * Core.LexerImpl may be given a lexer object rather than a name; the
 * factory result is then asserted to be a DirectLex.
 */
public function test_create_objectLexerImpl()
{
    $implementation = new HTMLPurifier_Lexer_DirectLex();
    $this->config->set('Core.LexerImpl', $implementation);
    $created = HTMLPurifier_Lexer::create($this->config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * An unrecognized Core.LexerImpl name is expected to make the factory
 * throw an HTMLPurifier_Exception with the message below.
 */
public function test_create_unknownLexer()
{
    $this->config->set('Core.LexerImpl', 'AsdfAsdf');
    $anticipated = new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf');
    $this->expectException($anticipated);
    HTMLPurifier_Lexer::create($this->config);
}
/**
 * Requesting DOMLex together with Core.MaintainLineNumbers is expected
 * to make the factory throw an HTMLPurifier_Exception.
 */
public function test_create_incompatibleLexer()
{
    $this->config->set('Core.LexerImpl', 'DOMLex');
    $this->config->set('Core.MaintainLineNumbers', true);
    $anticipated = new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
    $this->expectException($anticipated);
    HTMLPurifier_Lexer::create($this->config);
}
47 // HTMLPurifier_Lexer->parseData() -----------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::parseData() turns $input into $expect.
 *
 * @param string      $input  Raw data handed to parseData().
 * @param string|bool $expect Expected result; the literal true means
 *                            "identical to $input".
 */
public function assertParseData($input, $expect = true)
{
    $wanted = ($expect === true) ? $input : $expect;
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->parseData($input);
    $this->assertIdentical($wanted, $actual);
}
/**
 * Text without entities is expected to pass through parseData() unchanged.
 */
public function test_parseData_plainText()
{
    $raw = 'asdf';
    $this->assertParseData($raw);
}
/**
 * The ampersand entity is expected to decode to a bare ampersand.
 */
public function test_parseData_ampersandEntity()
{
    $raw = '&amp;';
    $decoded = '&';
    $this->assertParseData($raw, $decoded);
}
/**
 * The quot entity is expected to decode to a double quote.
 */
public function test_parseData_quotEntity()
{
    $raw = '&quot;';
    $decoded = '"';
    $this->assertParseData($raw, $decoded);
}
/**
 * The zero-padded numeric entity for the apostrophe is expected to decode
 * to a single quote.
 */
public function test_parseData_aposNumericEntity()
{
    $raw = '&#039;';
    $decoded = "'";
    $this->assertParseData($raw, $decoded);
}
/**
 * The unpadded numeric entity for the apostrophe is expected to decode
 * to a single quote.
 */
public function test_parseData_aposCompactNumericEntity()
{
    $raw = '&#39;';
    $decoded = "'";
    $this->assertParseData($raw, $decoded);
}
/**
 * Three back-to-back ampersand entities are expected to decode to three
 * ampersands.
 */
public function test_parseData_adjacentAmpersandEntities()
{
    $raw = '&amp;&amp;&amp;';
    $decoded = '&&&';
    $this->assertParseData($raw, $decoded);
}
/**
 * A raw ampersand following an ampersand entity is expected to survive
 * next to the decoded one.
 */
public function test_parseData_trailingUnescapedAmpersand()
{
    $raw = '&amp;&';
    $decoded = '&&';
    $this->assertParseData($raw, $decoded);
}
/**
 * A raw ampersand in the middle of text is expected to be left alone.
 */
public function test_parseData_internalUnescapedAmpersand()
{
    $raw = 'Procter & Gamble';
    $this->assertParseData($raw);
}
/**
 * The hexadecimal entity below is expected to come back from parseData()
 * unchanged.
 */
public function test_parseData_improperEntityFaultToleranceTest()
{
    $raw = '&#x2D;';
    $this->assertParseData($raw);
}
101 // HTMLPurifier_Lexer->extractBody() ---------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() reduces $text to $extract.
 *
 * @param string      $text    Markup handed to extractBody().
 * @param string|bool $extract Expected result; the literal true means
 *                             "identical to $text".
 */
public function assertExtractBody($text, $extract = true)
{
    $lexer = new HTMLPurifier_Lexer();
    $actual = $lexer->extractBody($text);
    $wanted = ($extract === true) ? $text : $extract;
    $this->assertIdentical($wanted, $actual);
}
/**
 * Markup without body tags is expected to be returned unchanged.
 */
public function test_extractBody_noBodyTags()
{
    $markup = '<b>Bold</b>';
    $this->assertExtractBody($markup);
}
/**
 * Lowercase html/body wrappers are expected to be stripped, leaving the
 * inner markup.
 */
public function test_extractBody_lowercaseBodyTags()
{
    $document = '<html><body><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * Uppercase HTML/BODY wrappers are expected to be stripped, leaving the
 * inner markup with its case untouched.
 */
public function test_extractBody_uppercaseBodyTags()
{
    $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $inner = '<B>Bold</B>';
    $this->assertExtractBody($document, $inner);
}
// Feeds assertExtractBody() a complete XHTML document (XML declaration,
// DOCTYPE, head and body) and expects only the form markup found between
// the body tags to be returned.
// NOTE(review): this listing appears to have lost punctuation-only lines
// (for example the opening quote of the expected string) and the original
// indentation inside both literals - confirm the exact whitespace against
// the real file before editing either string.
126 public function test_extractBody_realisticUseCase()
128 $this->assertExtractBody(
129 '<?xml version="1.0"
130 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
131 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
132 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
133 <head>
134 <title>xyz</title>
135 </head>
136 <body>
137 <form method="post" action="whatever1">
138 <div>
139 <input type="text" name="username" />
140 <input type="text" name="password" />
141 <input type="submit" />
142 </div>
143 </form>
144 </body>
145 </html>',
147 <form method="post" action="whatever1">
148 <div>
149 <input type="text" name="username" />
150 <input type="text" name="password" />
151 <input type="submit" />
152 </div>
153 </form>
/**
 * A body tag carrying attributes is still expected to be recognized and
 * stripped.
 */
public function test_extractBody_bodyWithAttributes()
{
    $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * A body tag that is never closed is not accepted: the input is expected
 * to come back unchanged.
 */
public function test_extractBody_preserveUnclosedBody()
{
    $unclosed = '<body>asdf';
    $this->assertExtractBody($unclosed);
}
/**
 * With two closing body tags, extraction is expected to run up to the
 * last one, keeping the earlier closing tag in the result.
 */
public function test_extractBody_useLastBody()
{
    $document = '<body>foo</body>bar</body>';
    $inner = 'foo</body>bar';
    $this->assertExtractBody($document, $inner);
}
172 // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
/**
 * Tokenizes $input with every available lexer and asserts the result.
 *
 * DirectLex is always exercised; DOMLex and PH5P are added when the
 * DOMDocument class exists.
 *
 * @param string $input      HTML to tokenize.
 * @param array  $expect     Tokens expected from a lexer that has no
 *                           entry in $alt_expect.
 * @param array  $alt_expect Per-lexer overrides keyed by lexer name; an
 *                           array replaces $expect for that lexer, and
 *                           false skips the assertion for it.
 */
public function assertTokenization($input, $expect, $alt_expect = array())
{
    $lexers = array('DirectLex' => new HTMLPurifier_Lexer_DirectLex());
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
    }

    foreach ($lexers as $name => $lexer) {
        // Tokenization runs even for a lexer whose assertion is skipped.
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);

        $overridden = isset($alt_expect[$name]);
        if ($overridden && $alt_expect[$name] === false) {
            continue;
        }

        $t_expect = $overridden ? $alt_expect[$name] : $expect;
        $this->assertIdentical($result, $t_expect, "$name: %s");

        // Dump the tokens actually produced whenever they differ loosely.
        if ($t_expect != $result) {
            printTokens($result);
        }
    }
}
/**
 * An empty string is expected to produce no tokens.
 */
public function test_tokenizeHTML_emptyInput()
{
    $html = '';
    $tokens = array();
    $this->assertTokenization($html, $tokens);
}
/**
 * Plain text is expected to become a single text token.
 */
public function test_tokenizeHTML_plainText()
{
    $html = 'This is regular text.';
    $tokens = array(
        new HTMLPurifier_Token_Text('This is regular text.'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * Text around and inside a b element is expected to yield text, start
 * and end tokens in document order.
 */
public function test_tokenizeHTML_textAndTags()
{
    $html = 'This is <b>bold</b> text';
    $tokens = array(
        new HTMLPurifier_Token_Text('This is '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('bold'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_Text(' text'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * An element opened as DIV and closed as div is checked against a start
 * token built with 'DIV' and an end token built with 'div'.
 */
public function test_tokenizeHTML_normalizeCase()
{
    $html = '<DIV>Totally rad dude. <b>asdf</b></div>';
    $tokens = array(
        new HTMLPurifier_Token_Start('DIV', array()),
        new HTMLPurifier_Token_Text('Totally rad dude. '),
        new HTMLPurifier_Token_Start('b', array()),
        new HTMLPurifier_Token_Text('asdf'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('div'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * Badly nested, unknown tags: the default expectation mirrors the input
 * tag by tag, while DOMLex and PH5P each have their own expected output.
 */
public function test_tokenizeHTML_notWellFormed()
{
    $html = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';

    $verbatim = array(
        new HTMLPurifier_Token_Start('asdf'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_Start('d'),
        new HTMLPurifier_Token_End('d'),
        new HTMLPurifier_Token_Start('poOloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Start('ds'),
        new HTMLPurifier_Token_End('asdf'),
        new HTMLPurifier_Token_End('ASDF'),
    );

    $fromDom = array(
        new HTMLPurifier_Token_Empty('asdf'),
        new HTMLPurifier_Token_Empty('d'),
        new HTMLPurifier_Token_Start('pooloka'),
        new HTMLPurifier_Token_Start('poolasdf'),
        new HTMLPurifier_Token_Empty('ds'),
        new HTMLPurifier_Token_End('poolasdf'),
        new HTMLPurifier_Token_End('pooloka'),
    );

    // 20140831: Weird, but whatever...
    $fromPh5p = array(new HTMLPurifier_Token_Empty('asdf'));

    $this->assertTokenization(
        $html,
        $verbatim,
        array(
            'DOMLex' => $fromDom,
            'PH5P' => $fromPh5p,
        )
    );
}
/**
 * A tab and a newline between attributes are expected to be treated as
 * ordinary attribute separators.
 */
public function test_tokenizeHTML_whitespaceInTag()
{
    $html = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
    $tokens = array(
        new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
        new HTMLPurifier_Token_Text('Link to '),
        new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
        new HTMLPurifier_Token_Text('foobar'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * An entity inside the only attribute of an empty tag is expected to be
 * decoded in the resulting token.
 */
public function test_tokenizeHTML_singleAttribute()
{
    $html = '<br style="&amp;" />';
    $tokens = array(
        new HTMLPurifier_Token_Empty('br', array('style' => '&')),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * A self-closed br is expected to become a single empty token.
 */
public function test_tokenizeHTML_emptyTag()
{
    $html = '<br />';
    $tokens = array(new HTMLPurifier_Token_Empty('br'));
    $this->assertTokenization($html, $tokens);
}
/**
 * A well-formed comment is expected to become a comment token holding
 * the text between the delimiters, surrounding spaces included.
 */
public function test_tokenizeHTML_comment()
{
    $html = '<!-- Comment -->';
    $tokens = array(new HTMLPurifier_Token_Comment(' Comment '));
    $this->assertTokenization($html, $tokens);
}
/**
 * A comment closed with three dashes is expected to keep the extra dash
 * as part of its text.
 */
public function test_tokenizeHTML_malformedComment()
{
    $html = '<!-- not so well formed --->';
    $tokens = array(new HTMLPurifier_Token_Comment(' not so well formed -'));
    $this->assertTokenization($html, $tokens);
}
/**
 * A tag that never closes: by default it is expected as literal text,
 * DOMLex is expected to produce an empty a token, and PH5P is skipped.
 */
public function test_tokenizeHTML_unterminatedTag()
{
    $html = '<a href=""';
    $asText = array(new HTMLPurifier_Token_Text('<a href=""'));
    $perLexer = array(
        // I like our behavior better, but it's non-standard
        'DOMLex' => array(new HTMLPurifier_Token_Empty('a', array('href'=>''))),
        // total barfing, grabs scaffolding too
        'PH5P' => false,
    );
    $this->assertTokenization($html, $asText, $perLexer);
}
/**
 * Escaped angle brackets are expected to decode into one text token by
 * default; PH5P is expected to split them into three.
 */
public function test_tokenizeHTML_specialEntities()
{
    $html = '&lt;b&gt;';
    $merged = array(
        new HTMLPurifier_Token_Text('<b>'),
    );
    // some parsers will separate entities out
    $split = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
    );
    $this->assertTokenization($html, $merged, array('PH5P' => $split));
}
/**
 * A stray quote inside a tag: the default expectation is a bare empty a
 * token, DirectLex is expected to keep the quote as an attribute name,
 * and PH5P is skipped here.
 */
public function test_tokenizeHTML_earlyQuote()
{
    $html = '<a "=>';
    $tokens = array(new HTMLPurifier_Token_Empty('a'));
    $perLexer = array(
        // we barf on this input
        'DirectLex' => array(
            new HTMLPurifier_Token_Start('a', array('"' => '')),
        ),
        // behavior varies; handle this personally
        'PH5P' => false,
    );
    $this->assertTokenization($html, $tokens, $perLexer);
}
/**
 * PH5P-only variant of the stray-quote case. Does nothing without
 * DOMDocument. The expected token is a start tag when the context value
 * 'PH5PError' is truthy, and an empty tag otherwise.
 */
public function test_tokenizeHTML_earlyQuote_PH5P()
{
    if (!class_exists('DOMDocument')) {
        return;
    }

    $lexer = new HTMLPurifier_Lexer_PH5P();
    $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);

    if ($this->context->get('PH5PError', true)) {
        $tag = new HTMLPurifier_Token_Start('a', array('"' => ''));
    } else {
        $tag = new HTMLPurifier_Token_Empty('a', array('"' => ''));
    }
    $this->assertIdentical(array($tag), $result);
}
/**
 * A bare double quote is expected to become a one-character text token.
 */
public function test_tokenizeHTML_unescapedQuote()
{
    $html = '"';
    $tokens = array(new HTMLPurifier_Token_Text('"'));
    $this->assertTokenization($html, $tokens);
}
/**
 * The quot entity is expected to decode into a double-quote text token.
 */
public function test_tokenizeHTML_escapedQuote()
{
    $html = '&quot;';
    $tokens = array(new HTMLPurifier_Token_Text('"'));
    $this->assertTokenization($html, $tokens);
}
/**
 * CDATA content is expected as one literal text token by default (markup
 * and entities untouched); PH5P is expected to split it into fragments.
 */
public function test_tokenizeHTML_cdata()
{
    $html = '<![CDATA[You <b>can&#39;t</b> get me!]]>';
    $literal = array(
        new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!'),
    );
    $fragments = array(
        new HTMLPurifier_Token_Text('You '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('can'),
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('#39;t'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/b'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text(' get me!'),
    );
    $this->assertTokenization($html, $literal, array('PH5P' => $fragments));
}
/**
 * The theta entity is expected to decode to the two-byte sequence
 * CE B8 in a text token.
 */
public function test_tokenizeHTML_characterEntity()
{
    $html = '&theta;';
    $tokens = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
    $this->assertTokenization($html, $tokens);
}
/**
 * An entity inside CDATA is expected to stay undecoded; PH5P is expected
 * to split it at the ampersand.
 */
public function test_tokenizeHTML_characterEntityInCDATA()
{
    $html = '<![CDATA[&rarr;]]>';
    $literal = array(new HTMLPurifier_Token_Text("&rarr;"));
    $fragments = array(
        new HTMLPurifier_Token_Text('&'),
        new HTMLPurifier_Token_Text('rarr;'),
    );
    $this->assertTokenization($html, $literal, array('PH5P' => $fragments));
}
/**
 * An ampersand entity inside an href is expected to be decoded in the
 * attribute value of the start token.
 */
public function test_tokenizeHTML_entityInAttribute()
{
    $html = '<a href="index.php?title=foo&amp;id=bar">Link</a>';
    $tokens = array(
        new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
        new HTMLPurifier_Token_Text('Link'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * The raw two-byte sequence CE B8 is expected to pass through into a
 * text token unchanged.
 */
public function test_tokenizeHTML_preserveUTF8()
{
    $html = "\xCE\xB8";
    $tokens = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
    $this->assertTokenization($html, $tokens);
}
/**
 * A less-than entity inside an attribute value is expected to be decoded
 * in the resulting empty token.
 */
public function test_tokenizeHTML_specialEntityInAttribute()
{
    $html = '<br test="x &lt; 6" />';
    $tokens = array(
        new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * Stray angle brackets used as emoticons must not be read as tags. The
 * default and PH5P expectations list the same tokens (lone less-than
 * split out); DOMLex is expected to keep the text in one token.
 */
public function test_tokenizeHTML_emoticonProtection()
{
    $html = '<b>Whoa! <3 That\'s not good >.></b>';

    $split = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    // text is absorbed together
    $absorbed = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    // interesting grouping: PH5P is listed with the same tokens as the default
    $ph5p = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('Whoa! '),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
        new HTMLPurifier_Token_End('b'),
    );

    $this->assertTokenization(
        $html,
        $split,
        array(
            'DOMLex' => $absorbed,
            'PH5P' => $ph5p,
        )
    );
}
/**
 * Angle brackets inside a comment are expected to stay in the comment
 * text, and the br that follows is still tokenized.
 */
public function test_tokenizeHTML_commentWithFunkyChars()
{
    $html = '<!-- This >< comment --><br />';
    $tokens = array(
        new HTMLPurifier_Token_Comment(' This >< comment '),
        new HTMLPurifier_Token_Empty('br'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * A comment with no closing delimiter is expected to run to the end of
 * input; the assertion is skipped for DOMLex and PH5P.
 */
public function test_tokenizeHTML_unterminatedComment()
{
    $html = '<!-- This >< comment';
    $tokens = array(new HTMLPurifier_Token_Comment(' This >< comment'));
    $skipped = array(
        'DOMLex' => false,
        'PH5P' => false,
    );
    $this->assertTokenization($html, $tokens, $skipped);
}
/**
 * With HTML.Trusted enabled, the contents of a script element are
 * expected as one raw text token, tag-like text included. PH5P is skipped.
 */
public function test_tokenizeHTML_scriptCDATAContents()
{
    $this->config->set('HTML.Trusted', true);

    $html = 'Foo: <script>alert("<foo>");</script>';
    $tokens = array(
        new HTMLPurifier_Token_Text('Foo: '),
        new HTMLPurifier_Token_Start('script'),
        new HTMLPurifier_Token_Text('alert("<foo>");'),
        new HTMLPurifier_Token_End('script'),
    );
    // PH5P, for some reason, bubbles the script to <head>
    $skipped = array('PH5P' => false);

    $this->assertTokenization($html, $tokens, $skipped);
}
/**
 * Entities and raw special characters inside a comment are expected to
 * be kept exactly as written.
 */
public function test_tokenizeHTML_entitiesInComment()
{
    $html = '<!-- This comment < &lt; & -->';
    $tokens = array(
        new HTMLPurifier_Token_Comment(' This comment < &lt; & '),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * Angle brackets inside a quoted attribute value: the default
 * expectation keeps them in the attribute, while DirectLex is expected
 * to end the tag early and emit the remainder as text.
 */
public function test_tokenizeHTML_attributeWithSpecialCharacters()
{
    $html = '<a href="><>">';
    $tokens = array(
        new HTMLPurifier_Token_Empty('a', array('href' => '><>')),
    );
    $directLex = array(
        new HTMLPurifier_Token_Start('a', array('href' => '')),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('">'),
    );
    $this->assertTokenization($html, $tokens, array('DirectLex' => $directLex));
}
/**
 * Slashes inside an attribute value are expected not to interfere with
 * recognizing the self-closing tag.
 */
public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
{
    $html = '<param name="src" value="http://example.com/video.wmv" />';
    $tokens = array(
        new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * A style element whose body is wrapped in an HTML comment. The default
 * expectation is a text token for the body; DirectLex is expected to
 * emit a comment token instead, and PH5P is skipped.
 */
public function test_tokenizeHTML_style()
{
    if (!defined('LIBXML_VERSION')) {
        // LIBXML_VERSION is missing in early versions of PHP, prior to
        // 1.30 of php-src/ext/libxml/libxml.c (version-wise, this
        // translates to 5.0.x). In such cases, punt the test entirely.
        return;
    }

    $attr = array('type' => 'text/css');

    $extra = array(
        // PH5P doesn't seem to like style tags
        'PH5P' => false,
        // DirectLex defers to RemoveForeignElements for textification
        'DirectLex' => array(
            new HTMLPurifier_Token_Start('style', $attr),
            new HTMLPurifier_Token_Comment("\ndiv {}\n"),
            new HTMLPurifier_Token_End('style'),
        ),
    );

    if (LIBXML_VERSION < 20628) {
        // libxml's behavior is wrong prior to this version, so make
        // appropriate accommodations
        $extra['DOMLex'] = $extra['DirectLex'];
    }

    $expected = array(
        new HTMLPurifier_Token_Start('style', $attr),
        new HTMLPurifier_Token_Text("\ndiv {}\n"),
        new HTMLPurifier_Token_End('style'),
    );

    // The literal below spans three lines; its continuation lines must
    // stay at column zero so no indentation leaks into the input.
    $this->assertTokenization(
        '<style type="text/css"><!--
div {}
--></style>',
        $expected,
        $extra
    );
}
/**
 * A tag name containing an at sign, followed by an extra greater-than.
 * The default expectation drops the at sign and closes the element;
 * DirectLex is expected to keep the at sign in the tag name.
 */
public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
{
    $html = '<a@>>';

    $tokens = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_End('a'),
    );

    // Technically this is invalid, but it won't be a
    // problem with invalid element removal; also, this
    // mimics Mozilla's parsing of the tag.
    $directLex = array(
        new HTMLPurifier_Token_Start('a@'),
        new HTMLPurifier_Token_Text('>'),
    );

    $this->assertTokenization($html, $tokens, array('DirectLex' => $directLex));
}
/**
 * A heart emoticon between two br tags: by default the less-than and
 * the digit are expected as separate text tokens; DOMLex is expected to
 * keep them together.
 */
public function test_tokenizeHTML_emoticonHeart()
{
    $html = '<br /><3<br />';
    $split = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('3'),
        new HTMLPurifier_Token_Empty('br'),
    );
    $joined = array(
        new HTMLPurifier_Token_Empty('br'),
        new HTMLPurifier_Token_Text('<3'),
        new HTMLPurifier_Token_Empty('br'),
    );
    $this->assertTokenization($html, $split, array('DOMLex' => $joined));
}
/**
 * Two consecutive less-than signs inside a b element: expected as two
 * text tokens by default and as one combined text token from DOMLex.
 */
public function test_tokenizeHTML_emoticonShiftyEyes()
{
    $html = '<b><<</b>';
    $split = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_End('b'),
    );
    $joined = array(
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('<<'),
        new HTMLPurifier_Token_End('b'),
    );
    $this->assertTokenization($html, $split, array('DOMLex' => $joined));
}
/**
 * A lone less-than followed by a space and a real tag: expected as
 * separate text tokens by default and as one text token from DOMLex.
 */
public function test_tokenizeHTML_eon1996()
{
    $html = '< <b>test</b>';
    $split = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text(' '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );
    $joined = array(
        new HTMLPurifier_Token_Text('< '),
        new HTMLPurifier_Token_Start('b'),
        new HTMLPurifier_Token_Text('test'),
        new HTMLPurifier_Token_End('b'),
    );
    $this->assertTokenization($html, $split, array('DOMLex' => $joined));
}
/**
 * Body tags inside CDATA are expected to be kept as literal text; PH5P
 * is expected to break that text into fragments.
 */
public function test_tokenizeHTML_bodyInCDATA()
{
    $html = '<![CDATA[<body>Foo</body>]]>';
    $literal = array(
        new HTMLPurifier_Token_Text('<body>Foo</body>'),
    );
    $fragments = array(
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('body'),
        new HTMLPurifier_Token_Text('>'),
        new HTMLPurifier_Token_Text('Foo'),
        new HTMLPurifier_Token_Text('<'),
        new HTMLPurifier_Token_Text('/body'),
        new HTMLPurifier_Token_Text('>'),
    );
    $this->assertTokenization($html, $literal, array('PH5P' => $fragments));
}
/**
 * A self-closed img nested in an a element is expected to yield start,
 * empty and end tokens.
 */
public function test_tokenizeHTML_()
{
    $html = '<a><img /></a>';
    $tokens = array(
        new HTMLPurifier_Token_Start('a'),
        new HTMLPurifier_Token_Empty('img'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * An IE conditional comment, nested comment included, is expected to
 * produce no tokens at all.
 */
public function test_tokenizeHTML_ignoreIECondComment()
{
    $html = '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->';
    $tokens = array();
    $this->assertTokenization($html, $tokens);
}
/**
 * With Core.RemoveProcessingInstructions enabled, a processing
 * instruction is expected to produce no tokens.
 */
public function test_tokenizeHTML_removeProcessingInstruction()
{
    $this->config->set('Core.RemoveProcessingInstructions', true);
    $html = '<?xml blah blah ?>';
    $tokens = array();
    $this->assertTokenization($html, $tokens);
}
/**
 * With Core.NormalizeNewlines enabled, CR and CRLF are both expected to
 * become LF in the text token.
 */
public function test_tokenizeHTML_removeNewline()
{
    $this->config->set('Core.NormalizeNewlines', true);
    $html = "plain\rtext\r\n";
    $tokens = array(
        new HTMLPurifier_Token_Text("plain\ntext\n"),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * With Core.NormalizeNewlines disabled, CR and CRLF are expected to be
 * left as written.
 */
public function test_tokenizeHTML_noRemoveNewline()
{
    $this->config->set('Core.NormalizeNewlines', false);
    $html = "plain\rtext\r\n";
    $tokens = array(
        new HTMLPurifier_Token_Text("plain\rtext\r\n"),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * Two conditional comments with text between them: only the text in
 * between is expected to survive, so matching must not span both.
 */
public function test_tokenizeHTML_conditionalCommentUngreedy()
{
    $html = '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->';
    $tokens = array(
        new HTMLPurifier_Token_Text("b"),
    );
    $this->assertTokenization($html, $tokens);
}
/**
 * An img tag with a trailing space before the closing bracket and no
 * self-closing slash: expected as an empty token by default and as a
 * start token from DirectLex, with the same three attributes.
 */
public function test_tokenizeHTML_imgTag()
{
    $attributes = array(
        'src' => 'img_11775.jpg',
        'alt' => '[Img #11775]',
        'id' => 'EMBEDDED_IMG_11775',
    );

    $asStart = array(new HTMLPurifier_Token_Start('img', $attributes));
    $asEmpty = array(new HTMLPurifier_Token_Empty('img', $attributes));

    $this->assertTokenization(
        '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >',
        $asEmpty,
        array('DirectLex' => $asStart)
    );
}
/**
 * A closing div with no opener, followed by text: by default the end
 * token and the text are both expected; DOMLex and PH5P are expected to
 * drop the stray end tag but keep the text.
 */
public function test_tokenizeHTML_prematureDivClose()
{
    $html = '</div>dontdie';
    $tokens = array(
        new HTMLPurifier_Token_End('div'),
        new HTMLPurifier_Token_Text('dontdie'),
    );
    $textOnly = array(new HTMLPurifier_Token_Text('dontdie'));
    $perLexer = array(
        'DOMLex' => $textOnly,
        'PH5P' => $textOnly,
    );
    $this->assertTokenization($html, $tokens, $perLexer);
}
822 public function test_tokenizeHTML_()
824 $this->assertTokenization(
826 array(
835 // vim: et sw=4 sts=4