Fix extant broken PEARSax3 parsing patterns.
[htmlpurifier.git] / tests / HTMLPurifier / LexerTest.php
blob332559dd0604b288d32d0d4614e256a684e58759
1 <?php
/**
 * Test case for HTMLPurifier_Lexer and its implementations; the sections of
 * this file cover create(), parseData(), extractBody() and tokenizeHTML().
 */
3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
// Set to true by the constructor once the PEARSax3 lexer file has been
// loaded; assertTokenization() only exercises PEARSax3 when it is set.
6 protected $_has_pear = false;
/**
 * Loads the optional lexer implementations enabled through
 * $GLOBALS['HTMLPurifierTest']: PEARSax3 (availability is recorded in
 * $_has_pear) and PH5P.
 */
8 public function __construct() {
9 parent::__construct();
10 if ($GLOBALS['HTMLPurifierTest']['PEAR']) {
11 require_once 'HTMLPurifier/Lexer/PEARSax3.php';
12 $this->_has_pear = true;
// NOTE(review): no availability flag is kept for PH5P, although
// assertTokenization() instantiates it whenever DOMDocument exists;
// presumably class loading is handled elsewhere -- confirm.
14 if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
15 require_once 'HTMLPurifier/Lexer/PH5P.php';
19 // HTMLPurifier_Lexer::create() --------------------------------------------
// With Core.MaintainLineNumbers enabled, create() must hand back a
// DirectLex instance.
21 function test_create() {
22 $this->config->set('Core.MaintainLineNumbers', true);
23 $lexer = HTMLPurifier_Lexer::create($this->config);
24 $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
// Core.LexerImpl may hold a lexer object instead of a name; create() is
// then expected to yield a DirectLex instance.
27 function test_create_objectLexerImpl() {
28 $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
29 $lexer = HTMLPurifier_Lexer::create($this->config);
30 $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
// An unrecognized Core.LexerImpl name must make create() throw an
// HTMLPurifier_Exception carrying the message below.
33 function test_create_unknownLexer() {
34 $this->config->set('Core.LexerImpl', 'AsdfAsdf');
35 $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
36 HTMLPurifier_Lexer::create($this->config);
// Asking for DOMLex together with Core.MaintainLineNumbers must make
// create() throw: per the expected message, that lexer does not support
// line numbers.
39 function test_create_incompatibleLexer() {
40 $this->config->set('Core.LexerImpl', 'DOMLex');
41 $this->config->set('Core.MaintainLineNumbers', true);
42 $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
43 HTMLPurifier_Lexer::create($this->config);
46 // HTMLPurifier_Lexer->parseData() -----------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::parseData() turns $input into $expect.
 *
 * @param string      $input  Raw text handed to parseData().
 * @param string|true $expect Expected result; true (the default) means the
 *                            input must come back unchanged.
 */
48 function assertParseData($input, $expect = true) {
49 if ($expect === true) $expect = $input;
50 $lexer = new HTMLPurifier_Lexer();
51 $this->assertIdentical($expect, $lexer->parseData($input));
// Text without entities passes through parseData() unchanged.
54 function test_parseData_plainText() {
55 $this->assertParseData('asdf');
// The &amp; entity decodes to a literal ampersand.
58 function test_parseData_ampersandEntity() {
59 $this->assertParseData('&amp;', '&');
// The &quot; entity decodes to a double quote.
62 function test_parseData_quotEntity() {
63 $this->assertParseData('&quot;', '"');
// The zero-padded numeric entity &#039; decodes to an apostrophe.
66 function test_parseData_aposNumericEntity() {
67 $this->assertParseData('&#039;', "'");
// The unpadded numeric entity &#39; decodes to an apostrophe as well.
70 function test_parseData_aposCompactNumericEntity() {
71 $this->assertParseData('&#39;', "'");
// Back-to-back &amp; entities are each decoded.
74 function test_parseData_adjacentAmpersandEntities() {
75 $this->assertParseData('&amp;&amp;&amp;', '&&&');
// A bare ampersand following an entity is kept as-is.
78 function test_parseData_trailingUnescapedAmpersand() {
79 $this->assertParseData('&amp;&', '&&');
// A bare ampersand surrounded by text is left untouched.
82 function test_parseData_internalUnescapedAmpersand() {
83 $this->assertParseData('Procter & Gamble');
// The hexadecimal entity &#x2D; is expected to come back unchanged, i.e.
// parseData() must not decode it.
86 function test_parseData_improperEntityFaultToleranceTest() {
87 $this->assertParseData('&#x2D;');
90 // HTMLPurifier_Lexer->extractBody() ---------------------------------------
/**
 * Asserts that HTMLPurifier_Lexer::extractBody() reduces $text to $extract.
 *
 * @param string      $text    Markup handed to extractBody().
 * @param string|true $extract Expected result; true (the default) means the
 *                             text must come back unchanged.
 */
92 function assertExtractBody($text, $extract = true) {
93 $lexer = new HTMLPurifier_Lexer();
94 $result = $lexer->extractBody($text);
95 if ($extract === true) $extract = $text;
96 $this->assertIdentical($extract, $result);
// Markup without body tags is returned unchanged.
99 function test_extractBody_noBodyTags() {
100 $this->assertExtractBody('<b>Bold</b>');
// The content between lowercase body tags is extracted.
103 function test_extractBody_lowercaseBodyTags() {
104 $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
// Uppercase body tags are recognised too; the extracted markup keeps its
// original case.
107 function test_extractBody_uppercaseBodyTags() {
108 $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
// Full XHTML document (XML prolog, DOCTYPE, head and body): only the form
// markup inside the body element is expected back.
111 function test_extractBody_realisticUseCase() {
112 $this->assertExtractBody(
113 '<?xml version="1.0"
114 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
115 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
116 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
117 <head>
118 <title>xyz</title>
119 </head>
120 <body>
121 <form method="post" action="whatever1">
122 <div>
123 <input type="text" name="username" />
124 <input type="text" name="password" />
125 <input type="submit" />
126 </div>
127 </form>
128 </body>
129 </html>',
131 <form method="post" action="whatever1">
132 <div>
133 <input type="text" name="username" />
134 <input type="text" name="password" />
135 <input type="submit" />
136 </div>
137 </form>
// Attributes on the opening body tag do not prevent extraction.
141 function test_extractBody_bodyWithAttributes() {
142 $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
// An opening body tag without a matching closing tag leaves the text
// untouched.
145 function test_extractBody_preserveUnclosedBody() {
146 $this->assertExtractBody('<body>asdf'); // not closed, don't accept
// With several closing body tags, extraction runs up to the last one.
149 function test_extractBody_useLastBody() {
150 $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
153 // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
/**
 * Runs tokenizeHTML() on $input with every available lexer and asserts the
 * resulting token list for each of them.
 *
 * DirectLex is always exercised, PEARSax3 only when $_has_pear is set, and
 * DOMLex plus PH5P only when the DOMDocument class exists.
 *
 * @param string $input      HTML to tokenize.
 * @param array  $expect     Tokens expected from any lexer without an
 *                           override.
 * @param array  $alt_expect Overrides keyed by lexer name: an array replaces
 *                           $expect for that lexer, false skips its
 *                           assertion.
 */
155 function assertTokenization($input, $expect, $alt_expect = array()) {
156 $lexers = array();
157 $lexers['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
158 if ($this->_has_pear) $lexers['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
159 if (class_exists('DOMDocument')) {
160 $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
161 $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
163 foreach ($lexers as $name => $lexer) {
164 $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
165 if (isset($alt_expect[$name])) {
// false opts this lexer out of the assertion; note that the lexer has
// already been run on the input at this point.
166 if ($alt_expect[$name] === false) continue;
167 $t_expect = $alt_expect[$name];
168 $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
169 } else {
170 $t_expect = $expect;
171 $this->assertIdentical($result, $expect, "$name: %s");
// On a (loosely compared) mismatch, pass the actual tokens to printTokens(),
// presumably a debugging dump helper defined outside this file.
173 if ($t_expect != $result) {
174 printTokens($result);
// Empty input yields an empty token list.
179 function test_tokenizeHTML_emptyInput() {
180 $this->assertTokenization('', array());
// Text without markup becomes a single Text token.
183 function test_tokenizeHTML_plainText() {
184 $this->assertTokenization(
185 'This is regular text.',
186 array(
187 new HTMLPurifier_Token_Text('This is regular text.')
// An inline element inside text yields Text, Start, Text, End, Text tokens.
192 function test_tokenizeHTML_textAndTags() {
193 $this->assertTokenization(
194 'This is <b>bold</b> text',
195 array(
196 new HTMLPurifier_Token_Text('This is '),
197 new HTMLPurifier_Token_Start('b', array()),
198 new HTMLPurifier_Token_Text('bold'),
199 new HTMLPurifier_Token_End('b'),
200 new HTMLPurifier_Token_Text(' text'),
// Mixed-case tag names: despite the test's name, the expected tokens keep
// the case used in the input ('DIV' for the start tag, 'div' for the end).
205 function test_tokenizeHTML_normalizeCase() {
206 $this->assertTokenization(
207 '<DIV>Totally rad dude. <b>asdf</b></div>',
208 array(
209 new HTMLPurifier_Token_Start('DIV', array()),
210 new HTMLPurifier_Token_Text('Totally rad dude. '),
211 new HTMLPurifier_Token_Start('b', array()),
212 new HTMLPurifier_Token_Text('asdf'),
213 new HTMLPurifier_Token_End('b'),
214 new HTMLPurifier_Token_End('div'),
// Unknown and improperly nested tags. The default expectation emits the
// tags exactly as written; DOMLex and PH5P share an alternate expectation
// with lowercased names, Empty tokens and balanced end tags.
219 function test_tokenizeHTML_notWellFormed() {
220 $this->assertTokenization(
221 '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
222 array(
223 new HTMLPurifier_Token_Start('asdf'),
224 new HTMLPurifier_Token_End('asdf'),
225 new HTMLPurifier_Token_Start('d'),
226 new HTMLPurifier_Token_End('d'),
227 new HTMLPurifier_Token_Start('poOloka'),
228 new HTMLPurifier_Token_Start('poolasdf'),
229 new HTMLPurifier_Token_Start('ds'),
230 new HTMLPurifier_Token_End('asdf'),
231 new HTMLPurifier_Token_End('ASDF'),
233 array(
234 'DOMLex' => $alt = array(
235 new HTMLPurifier_Token_Empty('asdf'),
236 new HTMLPurifier_Token_Empty('d'),
237 new HTMLPurifier_Token_Start('pooloka'),
238 new HTMLPurifier_Token_Start('poolasdf'),
239 new HTMLPurifier_Token_Empty('ds'),
240 new HTMLPurifier_Token_End('poolasdf'),
241 new HTMLPurifier_Token_End('pooloka'),
243 'PH5P' => $alt,
// A tab and a newline between attributes are accepted as separators.
248 function test_tokenizeHTML_whitespaceInTag() {
249 $this->assertTokenization(
250 '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
251 array(
252 new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
253 new HTMLPurifier_Token_Text('Link to '),
254 new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
255 new HTMLPurifier_Token_Text('foobar'),
256 new HTMLPurifier_Token_End('b'),
257 new HTMLPurifier_Token_End('a'),
// A self-closing tag yields a single Empty token.
262 function test_tokenizeHTML_emptyTag() {
263 $this->assertTokenization(
264 '<br />',
265 array( new HTMLPurifier_Token_Empty('br') )
// An HTML comment yields a Comment token holding the inner text.
269 function test_tokenizeHTML_comment() {
270 $this->assertTokenization(
271 '<!-- Comment -->',
272 array( new HTMLPurifier_Token_Comment(' Comment ') )
// A comment closed with three dashes keeps the surplus dash as part of the
// comment text.
276 function test_tokenizeHTML_malformedComment() {
277 $this->assertTokenization(
278 '<!-- not so well formed --->',
279 array( new HTMLPurifier_Token_Comment(' not so well formed -') )
// A tag that is never closed: the default expectation treats it as plain
// text, DOMLex is expected to produce an Empty token, PEARSax3 a Start
// token, and PH5P is skipped.
283 function test_tokenizeHTML_unterminatedTag() {
284 $this->assertTokenization(
285 '<a href=""',
286 array( new HTMLPurifier_Token_Text('<a href=""') ),
287 array(
288 // I like our behavior better, but it's non-standard
289 'DOMLex' => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
290 'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
291 'PH5P' => false, // total barfing, grabs scaffolding too
// &lt; and &gt; decode into text rather than markup; the PEARSax3 and PH5P
// expectations split the result into three Text tokens.
296 function test_tokenizeHTML_specialEntities() {
297 $this->assertTokenization(
298 '&lt;b&gt;',
299 array(
300 new HTMLPurifier_Token_Text('<b>')
302 array(
303 // some parsers will separate entities out
304 'PEARSax3' => $split = array(
305 new HTMLPurifier_Token_Text('<'),
306 new HTMLPurifier_Token_Text('b'),
307 new HTMLPurifier_Token_Text('>'),
309 'PH5P' => $split,
// A stray quote in attribute position: the default expectation is an
// attribute-less Empty token, DirectLex and PEARSax3 are expected to emit a
// Start token with a quote-named attribute, and PH5P is covered separately
// by test_tokenizeHTML_earlyQuote_PH5P().
314 function test_tokenizeHTML_earlyQuote() {
315 $this->assertTokenization(
316 '<a "=>',
317 array( new HTMLPurifier_Token_Empty('a') ),
318 array(
319 // we barf on this input
320 'DirectLex' => $tokens = array(
321 new HTMLPurifier_Token_Start('a', array('"' => ''))
323 'PEARSax3' => $tokens,
324 'PH5P' => false, // behavior varies; handle this personally
// PH5P-specific variant of the early-quote test; does nothing when
// DOMDocument is unavailable. A Start token is expected when the context's
// PH5PError value is truthy, an Empty token otherwise.
// NOTE(review): the meaning of the second argument to context->get() is not
// visible here -- presumably it suppresses the error for a missing key;
// confirm.
329 function test_tokenizeHTML_earlyQuote_PH5P() {
330 if (!class_exists('DOMDocument')) return;
331 $lexer = new HTMLPurifier_Lexer_PH5P();
332 $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
333 if ($this->context->get('PH5PError', true)) {
334 $this->assertIdentical(array(
335 new HTMLPurifier_Token_Start('a', array('"' => ''))
336 ), $result);
337 } else {
338 $this->assertIdentical(array(
339 new HTMLPurifier_Token_Empty('a', array('"' => ''))
340 ), $result);
// A bare double quote is ordinary text.
344 function test_tokenizeHTML_unescapedQuote() {
345 $this->assertTokenization(
346 '"',
347 array( new HTMLPurifier_Token_Text('"') )
// &quot; decodes to a double-quote Text token; PEARSax3 is skipped.
351 function test_tokenizeHTML_escapedQuote() {
352 $this->assertTokenization(
353 '&quot;',
354 array( new HTMLPurifier_Token_Text('"') ),
355 array(
356 'PEARSax3' => false, // PEAR barfs on this
// CDATA content is literal text: markup and entities inside it are not
// interpreted. The PEARSax3 and PH5P expectations contain the same text
// split into several Text tokens.
361 function test_tokenizeHTML_cdata() {
362 $this->assertTokenization(
363 '<![CDATA[You <b>can&#39;t</b> get me!]]>',
364 array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
365 array(
366 // PEAR splits up all of the CDATA
367 'PEARSax3' => $split = array(
368 new HTMLPurifier_Token_Text('You '),
369 new HTMLPurifier_Token_Text('<'),
370 new HTMLPurifier_Token_Text('b'),
371 new HTMLPurifier_Token_Text('>'),
372 new HTMLPurifier_Token_Text('can'),
373 new HTMLPurifier_Token_Text('&'),
374 new HTMLPurifier_Token_Text('#39;t'),
375 new HTMLPurifier_Token_Text('<'),
376 new HTMLPurifier_Token_Text('/b'),
377 new HTMLPurifier_Token_Text('>'),
378 new HTMLPurifier_Token_Text(' get me!'),
380 'PH5P' => $split,
// The named entity &theta; decodes to the two bytes CE B8.
385 function test_tokenizeHTML_characterEntity() {
386 $this->assertTokenization(
387 '&theta;',
388 array( new HTMLPurifier_Token_Text("\xCE\xB8") )
// A named entity inside CDATA stays undecoded; the PEARSax3 and PH5P
// expectations split it at the ampersand.
392 function test_tokenizeHTML_characterEntityInCDATA() {
393 $this->assertTokenization(
394 '<![CDATA[&rarr;]]>',
395 array( new HTMLPurifier_Token_Text("&rarr;") ),
396 array(
397 'PEARSax3' => $split = array(
398 new HTMLPurifier_Token_Text('&'),
399 new HTMLPurifier_Token_Text('rarr;'),
401 'PH5P' => $split,
// &amp; inside an attribute value is decoded to a plain ampersand.
406 function test_tokenizeHTML_entityInAttribute() {
407 $this->assertTokenization(
408 '<a href="index.php?title=foo&amp;id=bar">Link</a>',
409 array(
410 new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
411 new HTMLPurifier_Token_Text('Link'),
412 new HTMLPurifier_Token_End('a'),
// Raw multi-byte input (bytes CE B8) passes through unchanged.
417 function test_tokenizeHTML_preserveUTF8() {
418 $this->assertTokenization(
419 "\xCE\xB8",
420 array( new HTMLPurifier_Token_Text("\xCE\xB8") )
// &lt; inside an attribute value is decoded to a less-than sign.
424 function test_tokenizeHTML_specialEntityInAttribute() {
425 $this->assertTokenization(
426 '<br test="x &lt; 6" />',
427 array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
// Stray angle brackets in running text (emoticons) must not be parsed as
// tags. The default and PH5P expectations emit the less-than sign as its
// own Text token, DOMLex is expected to keep the text in one token, and
// PEARSax3 is skipped.
431 function test_tokenizeHTML_emoticonProtection() {
432 $this->assertTokenization(
433 '<b>Whoa! <3 That\'s not good >.></b>',
434 array(
435 new HTMLPurifier_Token_Start('b'),
436 new HTMLPurifier_Token_Text('Whoa! '),
437 new HTMLPurifier_Token_Text('<'),
438 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
439 new HTMLPurifier_Token_End('b')
441 array(
442 // text is absorbed together
443 'DOMLex' => array(
444 new HTMLPurifier_Token_Start('b'),
445 new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
446 new HTMLPurifier_Token_End('b'),
448 'PEARSax3' => false, // totally mangled
449 'PH5P' => array( // interesting grouping
450 new HTMLPurifier_Token_Start('b'),
451 new HTMLPurifier_Token_Text('Whoa! '),
452 new HTMLPurifier_Token_Text('<'),
453 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
454 new HTMLPurifier_Token_End('b'),
// Angle brackets inside a comment do not terminate it, and the tag after
// the comment is still tokenized; PEARSax3 is skipped.
460 function test_tokenizeHTML_commentWithFunkyChars() {
461 $this->assertTokenization(
462 '<!-- This >< comment --><br />',
463 array(
464 new HTMLPurifier_Token_Comment(' This >< comment '),
465 new HTMLPurifier_Token_Empty('br'),
467 array(
468 'PEARSax3' => false,
// A comment that is never closed becomes a Comment token holding the rest
// of the input. DOMLex, PEARSax3 and PH5P are all skipped, so only
// DirectLex is actually asserted.
473 function test_tokenizeHTML_unterminatedComment() {
474 $this->assertTokenization(
475 '<!-- This >< comment',
476 array( new HTMLPurifier_Token_Comment(' This >< comment') ),
477 array(
478 'DOMLex' => false,
479 'PEARSax3' => false,
480 'PH5P' => false,
// With HTML.Trusted enabled, the contents of a script element form one Text
// token even though they contain tag-like text; PEARSax3 and PH5P are
// skipped.
485 function test_tokenizeHTML_scriptCDATAContents() {
486 $this->config->set('HTML.Trusted', true);
487 $this->assertTokenization(
488 'Foo: <script>alert("<foo>");</script>',
489 array(
490 new HTMLPurifier_Token_Text('Foo: '),
491 new HTMLPurifier_Token_Start('script'),
492 new HTMLPurifier_Token_Text('alert("<foo>");'),
493 new HTMLPurifier_Token_End('script'),
495 array(
496 'PEARSax3' => false,
497 // PH5P, for some reason, bubbles the script to <head>
498 'PH5P' => false,
// Entities and stray special characters inside a comment are kept
// verbatim; PEARSax3 is skipped.
503 function test_tokenizeHTML_entitiesInComment() {
504 $this->assertTokenization(
505 '<!-- This comment < &lt; & -->',
506 array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
507 array(
508 'PEARSax3' => false
// Angle brackets inside a quoted attribute value: the default expectation
// keeps them in the attribute, DirectLex has its own expectation in which
// the tag ends at the first closing bracket, and PEARSax3 is skipped.
513 function test_tokenizeHTML_attributeWithSpecialCharacters() {
514 $this->assertTokenization(
515 '<a href="><>">',
516 array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
517 array(
518 'DirectLex' => array(
519 new HTMLPurifier_Token_Start('a', array('href' => '')),
520 new HTMLPurifier_Token_Text('<'),
521 new HTMLPurifier_Token_Text('">'),
523 'PEARSax3' => false,
// Slashes inside attribute values do not interfere with recognising the
// self-closing tag.
528 function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
529 $this->assertTokenization(
530 '<param name="src" value="http://example.com/video.wmv" />',
531 array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
// A style element whose CSS is wrapped in an HTML comment. The default
// expectation is a Text token with the CSS; DirectLex is expected to emit a
// Comment token instead, PH5P is skipped, and DOMLex reuses the DirectLex
// expectation on libxml versions below 20628.
535 function test_tokenizeHTML_style() {
536 $extra = array(
537 // PH5P doesn't seem to like style tags
538 'PH5P' => false,
539 // DirectLex defers to RemoveForeignElements for textification
540 'DirectLex' => array(
541 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
542 new HTMLPurifier_Token_Comment("\ndiv {}\n"),
543 new HTMLPurifier_Token_End('style'),
546 if (!defined('LIBXML_VERSION')) {
547 // LIBXML_VERSION is missing in early versions of PHP
548 // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
549 // this translates to 5.0.x). In such cases, punt the test entirely.
550 return;
551 } elseif (LIBXML_VERSION < 20628) {
552 // libxml's behavior is wrong prior to this version, so make
553 // appropriate accommodations
554 $extra['DOMLex'] = $extra['DirectLex'];
556 $this->assertTokenization(
557 '<style type="text/css"><!--
558 div {}
559 --></style>',
560 array(
561 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
562 new HTMLPurifier_Token_Text("\ndiv {}\n"),
563 new HTMLPurifier_Token_End('style'),
565 $extra
// A tag name containing an at sign, followed by a surplus closing bracket.
// The default expectation drops the at sign and closes the element;
// DirectLex and PEARSax3 are expected to keep the at sign in the tag name
// and emit no End token.
569 function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
570 $alt_expect = array(
571 // Technically this is invalid, but it won't be a
572 // problem with invalid element removal; also, this
573 // mimics Mozilla's parsing of the tag.
574 new HTMLPurifier_Token_Start('a@'),
575 new HTMLPurifier_Token_Text('>'),
577 $this->assertTokenization(
578 '<a@>>',
579 array(
580 new HTMLPurifier_Token_Start('a'),
581 new HTMLPurifier_Token_Text('>'),
582 new HTMLPurifier_Token_End('a'),
584 array(
585 'DirectLex' => $alt_expect,
586 'PEARSax3' => $alt_expect,
// A heart emoticon between two self-closing tags. By default the less-than
// sign and the digit are separate Text tokens; DOMLex is expected to keep
// them together. The PEARSax3 entry records that lexer's current (wrong)
// output rather than the desired one.
591 function test_tokenizeHTML_emoticonHeart() {
592 $this->assertTokenization(
593 '<br /><3<br />',
594 array(
595 new HTMLPurifier_Token_Empty('br'),
596 new HTMLPurifier_Token_Text('<'),
597 new HTMLPurifier_Token_Text('3'),
598 new HTMLPurifier_Token_Empty('br'),
600 array(
601 'DOMLex' => array(
602 new HTMLPurifier_Token_Empty('br'),
603 new HTMLPurifier_Token_Text('<3'),
604 new HTMLPurifier_Token_Empty('br'),
606 'PEARSax3' => array(
607 // bah too lazy to fix this
608 new HTMLPurifier_Token_Empty('br'),
609 new HTMLPurifier_Token_Empty('3<br'),
// Two consecutive less-than signs inside an element. By default each one is
// its own Text token; DOMLex is expected to merge them. The PEARSax3 entry
// records that lexer's current (wrong) output rather than the desired one.
615 function test_tokenizeHTML_emoticonShiftyEyes() {
616 $this->assertTokenization(
617 '<b><<</b>',
618 array(
619 new HTMLPurifier_Token_Start('b'),
620 new HTMLPurifier_Token_Text('<'),
621 new HTMLPurifier_Token_Text('<'),
622 new HTMLPurifier_Token_End('b'),
624 array(
625 'DOMLex' => array(
626 new HTMLPurifier_Token_Start('b'),
627 new HTMLPurifier_Token_Text('<<'),
628 new HTMLPurifier_Token_End('b'),
630 'PEARSax3' => array(
631 // also too lazy to fix
632 new HTMLPurifier_Token_Start('b'),
633 new HTMLPurifier_Token_Empty('<<'),
634 new HTMLPurifier_Token_Text('b>'),
// A lone less-than sign followed by a space and then a real tag. By default
// the sign and the space are separate Text tokens; DOMLex is expected to
// merge them, and the PEARSax3 entry records that lexer dropping the sign.
640 function test_tokenizeHTML_eon1996() {
641 $this->assertTokenization(
642 '< <b>test</b>',
643 array(
644 new HTMLPurifier_Token_Text('<'),
645 new HTMLPurifier_Token_Text(' '),
646 new HTMLPurifier_Token_Start('b'),
647 new HTMLPurifier_Token_Text('test'),
648 new HTMLPurifier_Token_End('b'),
650 array(
651 'DOMLex' => array(
652 new HTMLPurifier_Token_Text('< '),
653 new HTMLPurifier_Token_Start('b'),
654 new HTMLPurifier_Token_Text('test'),
655 new HTMLPurifier_Token_End('b'),
657 'PEARSax3' => array(
658 // totally doing the wrong thing here
659 new HTMLPurifier_Token_Text(' '),
660 new HTMLPurifier_Token_Start('b'),
661 new HTMLPurifier_Token_Text('test'),
662 new HTMLPurifier_Token_End('b'),
// Body tags inside CDATA are literal text and stay in the output. The PH5P
// and PEARSax3 expectations contain the same text split into several Text
// tokens.
668 function test_tokenizeHTML_bodyInCDATA() {
669 $alt_tokens = array(
670 new HTMLPurifier_Token_Text('<'),
671 new HTMLPurifier_Token_Text('body'),
672 new HTMLPurifier_Token_Text('>'),
673 new HTMLPurifier_Token_Text('Foo'),
674 new HTMLPurifier_Token_Text('<'),
675 new HTMLPurifier_Token_Text('/body'),
676 new HTMLPurifier_Token_Text('>'),
678 $this->assertTokenization(
679 '<![CDATA[<body>Foo</body>]]>',
680 array(
681 new HTMLPurifier_Token_Text('<body>Foo</body>'),
683 array(
684 'PH5P' => $alt_tokens,
685 'PEARSax3' => $alt_tokens,
// A self-closing img nested in an anchor yields Start, Empty, End tokens
// for every lexer (no overrides are given).
// NOTE(review): the method name has no descriptive suffix -- consider
// giving it one.
690 function test_tokenizeHTML_() {
691 $this->assertTokenization(
692 '<a><img /></a>',
693 array(
694 new HTMLPurifier_Token_Start('a'),
695 new HTMLPurifier_Token_Empty('img'),
696 new HTMLPurifier_Token_End('a'),
// NOTE(review): empty skeleton with no input or expected tokens, reusing
// the name of the preceding method. Presumably a copy-and-paste template
// that is commented out in the real file -- confirm before relying on it.
703 function test_tokenizeHTML_() {
704 $this->assertTokenization(
706 array(
715 // vim: et sw=4 sts=4