3 class HTMLPurifier_LexerTest
extends HTMLPurifier_Harness
6 protected $_has_pear = false;
/**
 * Runs the parent harness constructor and, when the PH5P lexer is switched
 * on in the global test settings, loads its class file.
 */
public function __construct()
{
    parent::__construct();
    // !empty() treats a missing 'HTMLPurifierTest' / 'PH5P' key as "off"
    // instead of raising an undefined-index notice; a truthy value still
    // triggers the include exactly as before.
    if (!empty($GLOBALS['HTMLPurifierTest']['PH5P'])) {
        require_once 'HTMLPurifier/Lexer/PH5P.php';
    }
}
16 // HTMLPurifier_Lexer::create() --------------------------------------------
/**
 * With Core.MaintainLineNumbers switched on, the factory is expected to
 * hand back a DirectLex instance.
 */
public function test_create()
{
    $config = $this->config;
    $config->set('Core.MaintainLineNumbers', true);
    $created = HTMLPurifier_Lexer::create($config);
    $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
}
/**
 * Core.LexerImpl may be given a lexer object; the factory result is then
 * expected to be a DirectLex instance.
 */
public function test_create_objectLexerImpl()
{
    $impl = new HTMLPurifier_Lexer_DirectLex();
    $this->config->set('Core.LexerImpl', $impl);
    $this->assertIsA(
        HTMLPurifier_Lexer::create($this->config),
        'HTMLPurifier_Lexer_DirectLex'
    );
}
/**
 * An unrecognised Core.LexerImpl name is expected to make the factory
 * throw an HTMLPurifier_Exception with the message below.
 */
public function test_create_unknownLexer()
{
    $expected = new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf');
    $this->config->set('Core.LexerImpl', 'AsdfAsdf');
    $this->expectException($expected);
    HTMLPurifier_Lexer::create($this->config);
}
/**
 * Requesting DOMLex together with Core.MaintainLineNumbers is expected to
 * make the factory throw an HTMLPurifier_Exception with the message below.
 */
public function test_create_incompatibleLexer()
{
    $expected = new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
    $config = $this->config;
    $config->set('Core.LexerImpl', 'DOMLex');
    $config->set('Core.MaintainLineNumbers', true);
    $this->expectException($expected);
    HTMLPurifier_Lexer::create($config);
}
47 // HTMLPurifier_Lexer->parseData() -----------------------------------------
/**
 * Runs $input through HTMLPurifier_Lexer::parseData() and asserts that the
 * outcome is identical to the expectation.
 *
 * @param string $input   text handed to parseData()
 * @param mixed  $expect  expected result; literal true means "same as $input"
 * @param bool   $is_attr forwarded to parseData() as its second argument
 */
public function assertParseData($input, $expect = true, $is_attr = false)
{
    $wanted = ($expect === true) ? $input : $expect;
    $parser = new HTMLPurifier_Lexer();
    $actual = $parser->parseData($input, $is_attr, $this->config);
    $this->assertIdentical($wanted, $actual);
}
/**
 * Plain text is expected to come back from parseData() unchanged.
 */
public function test_parseData_plainText()
{
    $text = 'asdf';
    $this->assertParseData($text);
}
/**
 * The named ampersand entity is expected to decode to a bare '&'.
 */
public function test_parseData_ampersandEntity()
{
    // The input must be the encoded form; passing a literal '&' would make
    // this test indistinguishable from a plain pass-through check.
    $this->assertParseData('&amp;', '&');
}
/**
 * The named quote entity is expected to decode to a double-quote character.
 */
public function test_parseData_quotEntity()
{
    // The input must be the encoded form so that decoding is actually exercised.
    $this->assertParseData('&quot;', '"');
}
/**
 * The zero-padded numeric entity for the apostrophe is expected to decode
 * to a single-quote character.
 */
public function test_parseData_aposNumericEntity()
{
    // A raw apostrophe inside a single-quoted literal is a parse error;
    // the encoded numeric form is what this test is named for.
    $this->assertParseData('&#039;', "'");
}
/**
 * The compact (no leading zero) numeric entity for the apostrophe is
 * expected to decode to a single-quote character.
 */
public function test_parseData_aposCompactNumericEntity()
{
    // A raw apostrophe inside a single-quoted literal is a parse error;
    // the compact numeric form is what this test is named for.
    $this->assertParseData('&#39;', "'");
}
/**
 * Three ampersand entities written back to back are expected to decode to
 * three bare ampersands.
 */
public function test_parseData_adjacentAmpersandEntities()
{
    // Encoded input, so that adjacency of entities is what gets tested.
    $this->assertParseData('&amp;&amp;&amp;', '&&&');
}
/**
 * Input ending in an unescaped ampersand is checked against '&&'.
 */
public function test_parseData_trailingUnescapedAmpersand()
{
    // NOTE(review): input and expectation are the same literal here; if the
    // first ampersand was meant to be an encoded entity, confirm and restore.
    $input = '&&';
    $expected = '&&';
    $this->assertParseData($input, $expected);
}
/**
 * An unescaped ampersand in the middle of text is expected to be left as is.
 */
public function test_parseData_internalUnescapedAmpersand()
{
    $text = 'Procter & Gamble';
    $this->assertParseData($text);
}
/**
 * Fault-tolerance check whose expected parseData() result is a hyphen.
 */
public function test_parseData_improperEntityFaultToleranceTest()
{
    // NOTE(review): the input is a literal hyphen, not an (improper) entity;
    // the intended encoded form cannot be determined from here, so confirm.
    $input = '-';
    $expected = '-';
    $this->assertParseData($input, $expected);
}
/**
 * Text-mode check for an ampersand sequence without a trailing semicolon;
 * the expected result is '&A'.
 */
public function test_parseData_noTrailingSemi()
{
    // NOTE(review): input equals the expectation; verify the intended input.
    $input = '&A';
    $expected = '&A';
    $this->assertParseData($input, $expected);
}
/**
 * Attribute-mode variant of the missing-semicolon check; the expected
 * result is '&A'.
 */
public function test_parseData_noTrailingSemiAttr()
{
    // NOTE(review): input equals the expectation; verify the intended input.
    $input = '&A';
    $expected = '&A';
    $inAttribute = true;
    $this->assertParseData($input, $expected, $inAttribute);
}
/**
 * Regression check labelled T119: attribute-mode parse with '&A' as the
 * expected result.
 */
public function test_parseData_T119()
{
    // NOTE(review): input equals the expectation; verify the intended input.
    $input = '&A';
    $expected = '&A';
    $inAttribute = true;
    $this->assertParseData($input, $expected, $inAttribute);
}
/**
 * Regression check labelled T119b: in attribute mode, '&trade=' is expected
 * to come back unchanged.
 */
public function test_parseData_T119b()
{
    $input = '&trade=';
    $sameAsInput = true;
    $inAttribute = true;
    $this->assertParseData($input, $sameAsInput, $inAttribute);
}
/**
 * With Core.LegacyEntityDecoder enabled, runs a table of inputs through
 * parseData() in both text and attribute mode.
 */
public function test_parseData_legacy1()
{
    $this->config->set('Core.LegacyEntityDecoder', true);
    // Each row: input, expectation (true = same as input), attribute mode.
    $cases = array(
        array('&a', true, false),
        array('&=', "&=", false),
        array('&a', true, true),
        array('&=', "&=", true),
        array('<a', true, false),
        array('<=', "<=", false),
        array('<a', true, true),
        array('<=', "<=", true),
    );
    foreach ($cases as $case) {
        list($input, $expect, $is_attr) = $case;
        $this->assertParseData($input, $expect, $is_attr);
    }
}
/**
 * Under the default (non-legacy) decoder configuration, runs a table of
 * inputs through parseData() in both text and attribute mode.
 */
public function test_parseData_nonlegacy1()
{
    // Each row: input, expectation (true = same as input), attribute mode.
    $cases = array(
        array('&a', "&a", false),
        array('&=', "&=", false),
        array('&a', true, true),
        array('&=', true, true),
        array('<a', "<a", false),
        array('<=', "<=", false),
        array('<a', true, true),
        array('<=', true, true),
        array('<a;', "<a;", false),
    );
    foreach ($cases as $case) {
        list($input, $expect, $is_attr) = $case;
        $this->assertParseData($input, $expect, $is_attr);
    }
}
/**
 * '&imath' without a trailing semicolon is expected to come back unchanged.
 */
public function test_parseData_noTrailingSemiNever()
{
    $input = '&imath';
    $this->assertParseData($input);
}
152 // HTMLPurifier_Lexer->extractBody() ---------------------------------------
/**
 * Runs $text through HTMLPurifier_Lexer::extractBody() and asserts that the
 * outcome is identical to the expectation.
 *
 * @param string $text    markup handed to extractBody()
 * @param mixed  $extract expected result; literal true means "same as $text"
 */
public function assertExtractBody($text, $extract = true)
{
    $extractor = new HTMLPurifier_Lexer();
    $actual = $extractor->extractBody($text);
    $wanted = ($extract === true) ? $text : $extract;
    $this->assertIdentical($wanted, $actual);
}
/**
 * Markup without body tags is expected to be returned unchanged.
 */
public function test_extractBody_noBodyTags()
{
    $markup = '<b>Bold</b>';
    $this->assertExtractBody($markup);
}
/**
 * Lowercase html/body wrappers are expected to be stripped, leaving only
 * the body contents.
 */
public function test_extractBody_lowercaseBodyTags()
{
    $document = '<html><body><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * Uppercase HTML/BODY wrappers are expected to be stripped, leaving only
 * the body contents.
 */
public function test_extractBody_uppercaseBodyTags()
{
    $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
    $inner = '<B>Bold</B>';
    $this->assertExtractBody($document, $inner);
}
177 public function test_extractBody_realisticUseCase()
179 $this->assertExtractBody(
181 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
182 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
183 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
188 <form method="post" action="whatever1">
190 <input type="text" name="username" />
191 <input type="text" name="password" />
192 <input type="submit" />
198 <form method="post" action="whatever1">
200 <input type="text" name="username" />
201 <input type="text" name="password" />
202 <input type="submit" />
/**
 * A body tag that carries attributes is still expected to be recognised
 * and stripped.
 */
public function test_extractBody_bodyWithAttributes()
{
    $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
    $inner = '<b>Bold</b>';
    $this->assertExtractBody($document, $inner);
}
/**
 * A body tag that is never closed is not accepted: the input is expected
 * to come back unchanged.
 */
public function test_extractBody_preserveUnclosedBody()
{
    $unclosed = '<body>asdf';
    $this->assertExtractBody($unclosed);
}
/**
 * When several closing body tags are present, extraction is expected to
 * run up to the last one.
 */
public function test_extractBody_useLastBody()
{
    $document = '<body>foo</body>bar</body>';
    $inner = 'foo</body>bar';
    $this->assertExtractBody($document, $inner);
}
/**
 * Body tags that occur only inside a comment are expected to be ignored,
 * so the input comes back unchanged.
 */
public function test_extractBody_ignoreCommented()
{
    $document = '$<!-- <body>foo</body> -->^';
    $this->assertExtractBody($document);
}
/**
 * A comment ahead of a real body element is not expected to prevent
 * extraction of that body's contents.
 */
public function test_extractBody_butCanStillWork()
{
    $document = '<!-- b --><body>a</body>';
    $inner = 'a';
    $this->assertExtractBody($document, $inner);
}
233 // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
/**
 * Tokenizes $input with every available lexer and checks each result.
 *
 * DirectLex is always exercised; DOMLex and PH5P are added only when the
 * DOMDocument class exists.
 *
 * @param string $input      markup passed to tokenizeHTML()
 * @param array  $expect     tokens expected from any lexer without an override
 * @param array  $alt_expect overrides keyed by lexer name ('DirectLex',
 *                           'DOMLex', 'PH5P'): an array replaces $expect for
 *                           that lexer, false skips that lexer's assertion
 */
public function assertTokenization($input, $expect, $alt_expect = array())
{
    // Start from an explicit empty list rather than relying on implicit
    // array creation by the first offset assignment.
    $lexers = array();
    $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
    if (class_exists('DOMDocument')) {
        $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
        $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
    }
    foreach ($lexers as $name => $lexer) {
        $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
        // Pick exactly one expectation per lexer so $t_expect is always
        // defined and each lexer is asserted against a single token list.
        if (isset($alt_expect[$name])) {
            if ($alt_expect[$name] === false) {
                continue;
            }
            $t_expect = $alt_expect[$name];
        } else {
            $t_expect = $expect;
        }
        $this->assertIdentical($result, $t_expect, "$name: %s");
        // Loose != is deliberate: it compares the token objects by value,
        // whereas !== would compare object identity and always differ.
        if ($t_expect != $result) {
            printTokens($result);
        }
    }
}
/**
 * An empty string is expected to tokenize to an empty token list.
 */
public function test_tokenizeHTML_emptyInput()
{
    $noTokens = array();
    $this->assertTokenization('', $noTokens);
}
/**
 * Text without markup is expected to become a single Text token.
 */
public function test_tokenizeHTML_plainText()
{
    $markup = 'This is regular text.';
    $tokens = array(
        new HTMLPurifier_Token_Text('This is regular text.'),
    );
    $this->assertTokenization($markup, $tokens);
}
/**
 * Text mixed with a simple element is expected to split into Text, Start
 * and End tokens in document order.
 */
public function test_tokenizeHTML_textAndTags()
{
    $markup = 'This is <b>bold</b> text';
    $tokens = array();
    $tokens[] = new HTMLPurifier_Token_Text('This is ');
    $tokens[] = new HTMLPurifier_Token_Start('b', array());
    $tokens[] = new HTMLPurifier_Token_Text('bold');
    $tokens[] = new HTMLPurifier_Token_End('b');
    $tokens[] = new HTMLPurifier_Token_Text(' text');
    $this->assertTokenization($markup, $tokens);
}
/**
 * Tokenizes markup whose opening tag is written in upper case and whose
 * closing tag is written in lower case.
 */
public function test_tokenizeHTML_normalizeCase()
{
    $markup = '<DIV>Totally rad dude. <b>asdf</b></div>';
    $tokens = array();
    $tokens[] = new HTMLPurifier_Token_Start('DIV', array());
    $tokens[] = new HTMLPurifier_Token_Text('Totally rad dude. ');
    $tokens[] = new HTMLPurifier_Token_Start('b', array());
    $tokens[] = new HTMLPurifier_Token_Text('asdf');
    $tokens[] = new HTMLPurifier_Token_End('b');
    $tokens[] = new HTMLPurifier_Token_End('div');
    $this->assertTokenization($markup, $tokens);
}
/**
 * Badly nested markup: the default expectation is a tag-for-tag token
 * stream, while DOMLex and PH5P each have their own expected output.
 */
public function test_tokenizeHTML_notWellFormed()
{
    $this->assertTokenization(
        '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
        array(
            new HTMLPurifier_Token_Start('asdf'),
            new HTMLPurifier_Token_End('asdf'),
            new HTMLPurifier_Token_Start('d'),
            new HTMLPurifier_Token_End('d'),
            new HTMLPurifier_Token_Start('poOloka'),
            new HTMLPurifier_Token_Start('poolasdf'),
            new HTMLPurifier_Token_Start('ds'),
            new HTMLPurifier_Token_End('asdf'),
            new HTMLPurifier_Token_End('ASDF'),
        ),
        array(
            // The override is passed inline; no local copy is kept because
            // nothing else in this method reads it.
            'DOMLex' => array(
                new HTMLPurifier_Token_Empty('asdf'),
                new HTMLPurifier_Token_Empty('d'),
                new HTMLPurifier_Token_Start('pooloka'),
                new HTMLPurifier_Token_Start('poolasdf'),
                new HTMLPurifier_Token_Empty('ds'),
                new HTMLPurifier_Token_End('poolasdf'),
                new HTMLPurifier_Token_End('pooloka'),
            ),
            // 20140831: Weird, but whatever...
            'PH5P' => array(new HTMLPurifier_Token_Empty('asdf')),
        )
    );
}
/**
 * A tab and a newline between attributes inside a tag are expected to be
 * treated as ordinary attribute separators.
 */
public function test_tokenizeHTML_whitespaceInTag()
{
    $markup = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
    $linkAttr = array('href'=>'foobar.php','title'=>'foo!');
    $boldAttr = array('id'=>'asdf');
    $tokens = array(
        new HTMLPurifier_Token_Start('a', $linkAttr),
        new HTMLPurifier_Token_Text('Link to '),
        new HTMLPurifier_Token_Start('b', $boldAttr),
        new HTMLPurifier_Token_Text('foobar'),
        new HTMLPurifier_Token_End('b'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($markup, $tokens);
}
/**
 * A self-closing br with one attribute is expected to yield a single Empty
 * token carrying that attribute.
 */
public function test_tokenizeHTML_singleAttribute()
{
    $markup = '<br style="&" />';
    $attr = array('style' => '&');
    $tokens = array(new HTMLPurifier_Token_Empty('br', $attr));
    $this->assertTokenization($markup, $tokens);
}
359 public function test_tokenizeHTML_emptyTag()
361 $this->assertTokenization(
363 array( new HTMLPurifier_Token_Empty('br') )
367 public function test_tokenizeHTML_comment()
369 $this->assertTokenization(
371 array( new HTMLPurifier_Token_Comment(' Comment ') )
/**
 * A comment closed with three dashes is expected to keep the surplus dash
 * as part of the comment text.
 */
public function test_tokenizeHTML_malformedComment()
{
    $markup = '<!-- not so well formed --->';
    $tokens = array(new HTMLPurifier_Token_Comment(' not so well formed -'));
    $this->assertTokenization($markup, $tokens);
}
383 public function test_tokenizeHTML_unterminatedTag()
385 $this->assertTokenization(
387 array( new HTMLPurifier_Token_Text('<a href=""') ),
389 // I like our behavior better, but it's non-standard
390 'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
391 'PH5P' => false, // total barfing, grabs scaffolding too
396 public function test_tokenizeHTML_specialEntities()
398 $this->assertTokenization(
401 new HTMLPurifier_Token_Text('<b>')
404 // some parsers will separate entities out
406 new HTMLPurifier_Token_Text('<'),
407 new HTMLPurifier_Token_Text('b'),
408 new HTMLPurifier_Token_Text('>'),
414 public function test_tokenizeHTML_earlyQuote()
416 $this->assertTokenization(
418 array( new HTMLPurifier_Token_Empty('a') ),
420 // we barf on this input
421 'DirectLex' => array(
422 new HTMLPurifier_Token_Start('a', array('"' => ''))
424 'PH5P' => false, // behavior varies; handle this personally
429 public function test_tokenizeHTML_earlyQuote_PH5P()
431 if (!class_exists('DOMDocument')) return;
432 $lexer = new HTMLPurifier_Lexer_PH5P();
433 $result = $lexer->tokenizeHTML('<a "=>', $this->config
, $this->context
);
434 if ($this->context
->get('PH5PError', true)) {
435 $this->assertIdentical(array(
436 new HTMLPurifier_Token_Start('a', array('"' => ''))
439 $this->assertIdentical(array(
440 new HTMLPurifier_Token_Empty('a', array('"' => ''))
445 public function test_tokenizeHTML_unescapedQuote()
447 $this->assertTokenization(
449 array( new HTMLPurifier_Token_Text('"') )
453 public function test_tokenizeHTML_escapedQuote()
455 $this->assertTokenization(
457 array( new HTMLPurifier_Token_Text('"') )
461 public function test_tokenizeHTML_cdata()
463 $this->assertTokenization(
464 '<![CDATA[You <b>can't</b> get me!]]>',
465 array( new HTMLPurifier_Token_Text('You <b>can't</b> get me!') ),
468 new HTMLPurifier_Token_Text('You '),
469 new HTMLPurifier_Token_Text('<'),
470 new HTMLPurifier_Token_Text('b'),
471 new HTMLPurifier_Token_Text('>'),
472 new HTMLPurifier_Token_Text('can'),
473 new HTMLPurifier_Token_Text('&'),
474 new HTMLPurifier_Token_Text('#39;t'),
475 new HTMLPurifier_Token_Text('<'),
476 new HTMLPurifier_Token_Text('/b'),
477 new HTMLPurifier_Token_Text('>'),
478 new HTMLPurifier_Token_Text(' get me!'),
484 public function test_tokenizeHTML_characterEntity()
486 $this->assertTokenization(
488 array( new HTMLPurifier_Token_Text("\xCE\xB8") )
492 public function test_tokenizeHTML_characterEntityInCDATA()
494 $this->assertTokenization(
495 '<![CDATA[→]]>',
496 array( new HTMLPurifier_Token_Text("→") ),
499 new HTMLPurifier_Token_Text('&'),
500 new HTMLPurifier_Token_Text('rarr;'),
/**
 * A link whose href holds a query string with an ampersand is expected to
 * tokenize into Start, Text and End tokens with the href value shown.
 */
public function test_tokenizeHTML_entityInAttribute()
{
    $markup = '<a href="index.php?title=foo&id=bar">Link</a>';
    $attr = array('href' => 'index.php?title=foo&id=bar');
    $tokens = array(
        new HTMLPurifier_Token_Start('a', $attr),
        new HTMLPurifier_Token_Text('Link'),
        new HTMLPurifier_Token_End('a'),
    );
    $this->assertTokenization($markup, $tokens);
}
518 public function test_tokenizeHTML_preserveUTF8()
520 $this->assertTokenization(
522 array( new HTMLPurifier_Token_Text("\xCE\xB8") )
/**
 * An attribute value containing a less-than sign is expected to be kept
 * intact on the resulting Empty token.
 */
public function test_tokenizeHTML_specialEntityInAttribute()
{
    $markup = '<br test="x < 6" />';
    $attr = array('test' => 'x < 6');
    $tokens = array(new HTMLPurifier_Token_Empty('br', $attr));
    $this->assertTokenization($markup, $tokens);
}
534 public function test_tokenizeHTML_emoticonProtection()
536 $this->assertTokenization(
537 '<b>Whoa! <3 That\'s not good >.></b>',
539 new HTMLPurifier_Token_Start('b'),
540 new HTMLPurifier_Token_Text('Whoa! '),
541 new HTMLPurifier_Token_Text('<'),
542 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
543 new HTMLPurifier_Token_End('b')
546 // text is absorbed together
548 new HTMLPurifier_Token_Start('b'),
549 new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
550 new HTMLPurifier_Token_End('b'),
552 'PH5P' => array( // interesting grouping
553 new HTMLPurifier_Token_Start('b'),
554 new HTMLPurifier_Token_Text('Whoa! '),
555 new HTMLPurifier_Token_Text('<'),
556 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
557 new HTMLPurifier_Token_End('b'),
/**
 * Angle brackets inside a comment are not expected to end it early; the
 * br that follows becomes its own Empty token.
 */
public function test_tokenizeHTML_commentWithFunkyChars()
{
    $markup = '<!-- This >< comment --><br />';
    $tokens = array();
    $tokens[] = new HTMLPurifier_Token_Comment(' This >< comment ');
    $tokens[] = new HTMLPurifier_Token_Empty('br');
    $this->assertTokenization($markup, $tokens);
}
574 public function test_tokenizeHTML_unterminatedComment()
576 $this->assertTokenization(
577 '<!-- This >< comment',
578 array( new HTMLPurifier_Token_Comment(' This >< comment') ),
586 public function test_tokenizeHTML_scriptCDATAContents()
588 $this->config
->set('HTML.Trusted', true);
589 $this->assertTokenization(
590 'Foo: <script>alert("<foo>");</script>',
592 new HTMLPurifier_Token_Text('Foo: '),
593 new HTMLPurifier_Token_Start('script'),
594 new HTMLPurifier_Token_Text('alert("<foo>");'),
595 new HTMLPurifier_Token_End('script'),
598 // PH5P, for some reason, bubbles the script to <head>
/**
 * The text of a comment is expected to come back exactly as written
 * between the comment delimiters.
 */
public function test_tokenizeHTML_entitiesInComment()
{
    $markup = '<!-- This comment < < & -->';
    $tokens = array(new HTMLPurifier_Token_Comment(' This comment < < & '));
    $this->assertTokenization($markup, $tokens);
}
612 public function test_tokenizeHTML_attributeWithSpecialCharacters()
614 $this->assertTokenization(
616 array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
618 'DirectLex' => array(
619 new HTMLPurifier_Token_Start('a', array('href' => '')),
620 new HTMLPurifier_Token_Text('<'),
621 new HTMLPurifier_Token_Text('">'),
/**
 * Slashes inside an attribute value are not expected to interfere with
 * recognising the tag as self-closing.
 */
public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
{
    $markup = '<param name="src" value="http://example.com/video.wmv" />';
    $attr = array('name' => 'src', 'value' => 'http://example.com/video.wmv');
    $tokens = array(new HTMLPurifier_Token_Empty('param', $attr));
    $this->assertTokenization($markup, $tokens);
}
635 public function test_tokenizeHTML_style()
638 // PH5P doesn't seem to like style tags
640 // DirectLex defers to RemoveForeignElements for textification
641 'DirectLex' => array(
642 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
643 new HTMLPurifier_Token_Comment("\ndiv {}\n"),
644 new HTMLPurifier_Token_End('style'),
647 if (!defined('LIBXML_VERSION')) {
648 // LIBXML_VERSION is missing in early versions of PHP
649 // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
650 // this translates to 5.0.x. In such cases, punt the test entirely.
652 } elseif (LIBXML_VERSION
< 20628) {
653 // libxml's behavior is wrong prior to this version, so make
654 // appropriate accommodations
655 $extra['DOMLex'] = $extra['DirectLex'];
657 $this->assertTokenization(
658 '<style type="text/css"><!--
662 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
663 new HTMLPurifier_Token_Text("\ndiv {}\n"),
664 new HTMLPurifier_Token_End('style'),
670 public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
673 // Technically this is invalid, but it won't be a
674 // problem with invalid element removal; also, this
675 // mimics Mozilla's parsing of the tag.
676 new HTMLPurifier_Token_Start('a@'),
677 new HTMLPurifier_Token_Text('>'),
679 $this->assertTokenization(
682 new HTMLPurifier_Token_Start('a'),
683 new HTMLPurifier_Token_Text('>'),
684 new HTMLPurifier_Token_End('a'),
687 'DirectLex' => $alt_expect,
692 public function test_tokenizeHTML_emoticonHeart()
694 $this->assertTokenization(
697 new HTMLPurifier_Token_Empty('br'),
698 new HTMLPurifier_Token_Text('<'),
699 new HTMLPurifier_Token_Text('3'),
700 new HTMLPurifier_Token_Empty('br'),
704 new HTMLPurifier_Token_Empty('br'),
705 new HTMLPurifier_Token_Text('<3'),
706 new HTMLPurifier_Token_Empty('br'),
712 public function test_tokenizeHTML_emoticonShiftyEyes()
714 $this->assertTokenization(
717 new HTMLPurifier_Token_Start('b'),
718 new HTMLPurifier_Token_Text('<'),
719 new HTMLPurifier_Token_Text('<'),
720 new HTMLPurifier_Token_End('b'),
724 new HTMLPurifier_Token_Start('b'),
725 new HTMLPurifier_Token_Text('<<'),
726 new HTMLPurifier_Token_End('b'),
732 public function test_tokenizeHTML_eon1996()
734 $this->assertTokenization(
737 new HTMLPurifier_Token_Text('<'),
738 new HTMLPurifier_Token_Text(' '),
739 new HTMLPurifier_Token_Start('b'),
740 new HTMLPurifier_Token_Text('test'),
741 new HTMLPurifier_Token_End('b'),
745 new HTMLPurifier_Token_Text('< '),
746 new HTMLPurifier_Token_Start('b'),
747 new HTMLPurifier_Token_Text('test'),
748 new HTMLPurifier_Token_End('b'),
754 public function test_tokenizeHTML_bodyInCDATA()
757 new HTMLPurifier_Token_Text('<'),
758 new HTMLPurifier_Token_Text('body'),
759 new HTMLPurifier_Token_Text('>'),
760 new HTMLPurifier_Token_Text('Foo'),
761 new HTMLPurifier_Token_Text('<'),
762 new HTMLPurifier_Token_Text('/body'),
763 new HTMLPurifier_Token_Text('>'),
765 $this->assertTokenization(
766 '<![CDATA[<body>Foo</body>]]>',
768 new HTMLPurifier_Token_Text('<body>Foo</body>'),
771 'PH5P' => $alt_tokens,
776 public function test_tokenizeHTML_()
778 $this->assertTokenization(
781 new HTMLPurifier_Token_Start('a'),
782 new HTMLPurifier_Token_Empty('img'),
783 new HTMLPurifier_Token_End('a'),
788 public function test_tokenizeHTML_ignoreIECondComment()
790 $this->assertTokenization(
791 '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->',
796 public function test_tokenizeHTML_removeProcessingInstruction()
798 $this->config
->set('Core.RemoveProcessingInstructions', true);
799 $this->assertTokenization(
800 '<?xml blah blah ?>',
805 public function test_tokenizeHTML_removeNewline()
807 $this->config
->set('Core.NormalizeNewlines', true);
808 $this->assertTokenization(
811 new HTMLPurifier_Token_Text("plain\ntext\n")
816 public function test_tokenizeHTML_noRemoveNewline()
818 $this->config
->set('Core.NormalizeNewlines', false);
819 $this->assertTokenization(
822 new HTMLPurifier_Token_Text("plain\rtext\r\n")
/**
 * Two conditional comments are expected to be removed separately, so the
 * text between them survives as the only token.
 */
public function test_tokenizeHTML_conditionalCommentUngreedy()
{
    $markup = '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->';
    $tokens = array(new HTMLPurifier_Token_Text("b"));
    $this->assertTokenization($markup, $tokens);
}
837 public function test_tokenizeHTML_imgTag()
840 new HTMLPurifier_Token_Start('img',
842 'src' => 'img_11775.jpg',
843 'alt' => '[Img #11775]',
844 'id' => 'EMBEDDED_IMG_11775',
848 $this->assertTokenization(
849 '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >',
851 new HTMLPurifier_Token_Empty('img',
853 'src' => 'img_11775.jpg',
854 'alt' => '[Img #11775]',
855 'id' => 'EMBEDDED_IMG_11775',
860 'DirectLex' => $start,
865 public function test_tokenizeHTML_prematureDivClose()
867 $this->assertTokenization(
868 '</div>dont<b>die</b>',
870 new HTMLPurifier_Token_End('div'),
871 new HTMLPurifier_Token_Text('dont'),
872 new HTMLPurifier_Token_Start('b'),
873 new HTMLPurifier_Token_Text('die'),
874 new HTMLPurifier_Token_End('b'),
877 'DOMLex' => $alt = array(
878 new HTMLPurifier_Token_Text('dont'),
879 new HTMLPurifier_Token_Start('b'),
880 new HTMLPurifier_Token_Text('die'),
881 new HTMLPurifier_Token_End('b')
891 public function test_tokenizeHTML_()
893 $this->assertTokenization(
904 // vim: et sw=4 sts=4