tests/HTMLPurifier/LexerTest.php

   1 <?php
   2
   3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
   4 {
   5
   6     protected $_has_pear = false;
   7
   8     function HTMLPurifier_LexerTest() {
   9         parent::HTMLPurifier_Harness();
  10         // E_STRICT = 2048, int used for PHP4 compat: this check disables
  11         // PEAR if PHP 5 strict mode is on, since the class is not strict safe
  12         if (
  13             $GLOBALS['HTMLPurifierTest']['PEAR'] &&
  14             ((error_reporting() & 2048) != 2048) // ought to be a better way
  15         ) {
  16             require_once 'HTMLPurifier/Lexer/PEARSax3.php';
  17             $this->_has_pear = true;
  18         }
  19         if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
  20             require_once 'HTMLPurifier/Lexer/PH5P.php';
  21         }
  22     }
  23
  24     // HTMLPurifier_Lexer::create() --------------------------------------------
  25
  26     function test_create() {
  27         $this->config->set('Core', 'MaintainLineNumbers', true);
  28         $lexer = HTMLPurifier_Lexer::create($this->config);
  29         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  30     }
  31
  32     // HTMLPurifier_Lexer->parseData() -----------------------------------------
  33
  34     function assertParseData($input, $expect = true) {
  35         if ($expect === true) $expect = $input;
  36         $lexer = new HTMLPurifier_Lexer();
  37         $this->assertIdentical($expect, $lexer->parseData($input));
  38     }
  39
  40     function test_parseData_plainText() {
  41         $this->assertParseData('asdf');
  42     }
  43
  44     function test_parseData_ampersandEntity() {
  45         $this->assertParseData('&amp;', '&');
  46     }
  47
  48     function test_parseData_quotEntity() {
  49         $this->assertParseData('&quot;', '"');
  50     }
  51
  52     function test_parseData_aposNumericEntity() {
  53         $this->assertParseData('&#039;', "'");
  54     }
  55
  56     function test_parseData_aposCompactNumericEntity() {
  57         $this->assertParseData('&#39;', "'");
  58     }
  59
  60     function test_parseData_adjacentAmpersandEntities() {
  61         $this->assertParseData('&amp;&amp;&amp;', '&&&');
  62     }
  63
  64     function test_parseData_trailingUnescapedAmpersand() {
  65         $this->assertParseData('&amp;&', '&&');
  66     }
  67
  68     function test_parseData_internalUnescapedAmpersand() {
  69         $this->assertParseData('Procter & Gamble');
  70     }
  71
  72     function test_parseData_improperEntityFaultToleranceTest() {
  73         $this->assertParseData('&#x2D;');
  74     }
  75
  76     // HTMLPurifier_Lexer->extractBody() ---------------------------------------
  77
  78     function assertExtractBody($text, $extract = true) {
  79         $lexer = new HTMLPurifier_Lexer();
  80         $result = $lexer->extractBody($text);
  81         if ($extract === true) $extract = $text;
  82         $this->assertIdentical($extract, $result);
  83     }
  84
  85     function test_extractBody_noBodyTags() {
  86         $this->assertExtractBody('<b>Bold</b>');
  87     }
  88
  89     function test_extractBody_lowercaseBodyTags() {
  90         $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
  91     }
  92
  93     function test_extractBody_uppercaseBodyTags() {
  94         $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
  95     }
  96
  97     function test_extractBody_realisticUseCase() {
  98         $this->assertExtractBody(
  99 '<?xml version="1.0"
 100 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 101     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 102 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 103    <head>
 104       <title>xyz</title>
 105    </head>
 106    <body>
 107       <form method="post" action="whatever1">
 108          <div>
 109             <input type="text" name="username" />
 110             <input type="text" name="password" />
 111             <input type="submit" />
 112          </div>
 113       </form>
 114    </body>
 115 </html>',
 116     '
 117       <form method="post" action="whatever1">
 118          <div>
 119             <input type="text" name="username" />
 120             <input type="text" name="password" />
 121             <input type="submit" />
 122          </div>
 123       </form>
 124    ');
 125     }
 126
 127     function test_extractBody_bodyWithAttributes() {
 128         $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
 129     }
 130
 131     function test_extractBody_preserveUnclosedBody() {
 132         $this->assertExtractBody('<body>asdf'); // not closed, don't accept
 133     }
 134
 135     // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
 136
 137     function assertTokenization($input, $expect, $alt_expect = array()) {
 138         $lexers = array();
 139         $lexers['DirectLex']  = new HTMLPurifier_Lexer_DirectLex();
 140         if ($this->_has_pear) $lexers['PEARSax3']   = new HTMLPurifier_Lexer_PEARSax3();
 141         if (version_compare(PHP_VERSION, "5", ">=") && class_exists('DOMDocument')) {
 142             $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
 143             $lexers['PH5P']   = new HTMLPurifier_Lexer_PH5P();
 144         }
 145         foreach ($lexers as $name => $lexer) {
 146             $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
 147             if (isset($alt_expect[$name])) {
 148                 if ($alt_expect[$name] === false) continue;
 149                 $t_expect = $alt_expect[$name];
 150                 $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
 151             } else {
 152                 $t_expect = $expect;
 153                 $this->assertIdentical($result, $expect, "$name: %s");
 154             }
 155             if ($t_expect != $result) {
 156                 printTokens($result);
 157                 //var_dump($result);
 158             }
 159         }
 160     }
 161
 162     function test_tokenizeHTML_emptyInput() {
 163         $this->assertTokenization('', array());
 164     }
 165
 166     function test_tokenizeHTML_plainText() {
 167         $this->assertTokenization(
 168             'This is regular text.',
 169             array(
 170                 new HTMLPurifier_Token_Text('This is regular text.')
 171             )
 172         );
 173     }
 174
 175     function test_tokenizeHTML_textAndTags() {
 176         $this->assertTokenization(
 177             'This is <b>bold</b> text',
 178             array(
 179                 new HTMLPurifier_Token_Text('This is '),
 180                 new HTMLPurifier_Token_Start('b', array()),
 181                 new HTMLPurifier_Token_Text('bold'),
 182                 new HTMLPurifier_Token_End('b'),
 183                 new HTMLPurifier_Token_Text(' text'),
 184             )
 185         );
 186     }
 187
 188     function test_tokenizeHTML_normalizeCase() {
 189         $this->assertTokenization(
 190             '<DIV>Totally rad dude. <b>asdf</b></div>',
 191             array(
 192                 new HTMLPurifier_Token_Start('DIV', array()),
 193                 new HTMLPurifier_Token_Text('Totally rad dude. '),
 194                 new HTMLPurifier_Token_Start('b', array()),
 195                 new HTMLPurifier_Token_Text('asdf'),
 196                 new HTMLPurifier_Token_End('b'),
 197                 new HTMLPurifier_Token_End('div'),
 198             )
 199         );
 200     }
 201
 202     function test_tokenizeHTML_notWellFormed() {
 203         $this->assertTokenization(
 204             '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
 205             array(
 206                 new HTMLPurifier_Token_Start('asdf'),
 207                 new HTMLPurifier_Token_End('asdf'),
 208                 new HTMLPurifier_Token_Start('d'),
 209                 new HTMLPurifier_Token_End('d'),
 210                 new HTMLPurifier_Token_Start('poOloka'),
 211                 new HTMLPurifier_Token_Start('poolasdf'),
 212                 new HTMLPurifier_Token_Start('ds'),
 213                 new HTMLPurifier_Token_End('asdf'),
 214                 new HTMLPurifier_Token_End('ASDF'),
 215             ),
 216             array(
 217                 'DOMLex' => $alt = array(
 218                     new HTMLPurifier_Token_Empty('asdf'),
 219                     new HTMLPurifier_Token_Empty('d'),
 220                     new HTMLPurifier_Token_Start('pooloka'),
 221                     new HTMLPurifier_Token_Start('poolasdf'),
 222                     new HTMLPurifier_Token_Empty('ds'),
 223                     new HTMLPurifier_Token_End('poolasdf'),
 224                     new HTMLPurifier_Token_End('pooloka'),
 225                 ),
 226                 'PH5P' => $alt,
 227             )
 228         );
 229     }
 230
 231     function test_tokenizeHTML_whitespaceInTag() {
 232         $this->assertTokenization(
 233             '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
 234             array(
 235                 new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
 236                 new HTMLPurifier_Token_Text('Link to '),
 237                 new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
 238                 new HTMLPurifier_Token_Text('foobar'),
 239                 new HTMLPurifier_Token_End('b'),
 240                 new HTMLPurifier_Token_End('a'),
 241             )
 242         );
 243     }
 244
 245     function test_tokenizeHTML_emptyTag() {
 246         $this->assertTokenization(
 247             '<br />',
 248             array( new HTMLPurifier_Token_Empty('br') )
 249         );
 250     }
 251
 252     function test_tokenizeHTML_comment() {
 253         $this->assertTokenization(
 254             '<!-- Comment -->',
 255             array( new HTMLPurifier_Token_Comment(' Comment ') ),
 256             array(
 257                 'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- Comment --') ),
 258             )
 259         );
 260     }
 261
 262     function test_tokenizeHTML_malformedComment() {
 263         $this->assertTokenization(
 264             '<!-- not so well formed --->',
 265             array( new HTMLPurifier_Token_Comment(' not so well formed -') ),
 266             array(
 267                 'PEARSax3' => array( new HTMLPurifier_Token_Comment('-- not so well formed ---') ),
 268             )
 269         );
 270     }
 271
 272     function test_tokenizeHTML_unterminatedTag() {
 273         $this->assertTokenization(
 274             '<a href=""',
 275             array( new HTMLPurifier_Token_Text('<a href=""') ),
 276             array(
 277                 // I like our behavior better, but it's non-standard
 278                 'DOMLex'   => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
 279                 'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
 280                 'PH5P' => false, // total barfing, grabs scaffolding too
 281             )
 282         );
 283     }
 284
 285     function test_tokenizeHTML_specialEntities() {
 286         $this->assertTokenization(
 287             '&lt;b&gt;',
 288             array(
 289                 new HTMLPurifier_Token_Text('<b>')
 290             ),
 291             array(
 292                 // some parsers will separate entities out
 293                 'PEARSax3' => $split = array(
 294                     new HTMLPurifier_Token_Text('<'),
 295                     new HTMLPurifier_Token_Text('b'),
 296                     new HTMLPurifier_Token_Text('>'),
 297                 ),
 298                 'PH5P' => $split,
 299             )
 300         );
 301     }
 302
 303     function test_tokenizeHTML_earlyQuote() {
 304         $this->assertTokenization(
 305             '<a "=>',
 306             array( new HTMLPurifier_Token_Empty('a') ),
 307             array(
 308                 // we barf on this input
 309                 'DirectLex' => $tokens = array(
 310                     new HTMLPurifier_Token_Start('a', array('"' => ''))
 311                 ),
 312                 'PEARSax3' => $tokens,
 313                 'PH5P' => array(
 314                     new HTMLPurifier_Token_Empty('a', array('"' => ''))
 315                 ),
 316             )
 317         );
 318     }
 319
 320     function test_tokenizeHTML_unescapedQuote() {
 321         $this->assertTokenization(
 322             '"',
 323             array( new HTMLPurifier_Token_Text('"') )
 324         );
 325     }
 326
 327     function test_tokenizeHTML_escapedQuote() {
 328         $this->assertTokenization(
 329             '&quot;',
 330             array( new HTMLPurifier_Token_Text('"') ),
 331             array(
 332                 'PEARSax3' => false, // PEAR barfs on this
 333             )
 334         );
 335     }
 336
 337     function test_tokenizeHTML_cdata() {
 338         $this->assertTokenization(
 339             '<![CDATA[You <b>can&#39;t</b> get me!]]>',
 340             array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
 341             array(
 342                 // PEAR splits up all of the CDATA
 343                 'PEARSax3' => $split = array(
 344                     new HTMLPurifier_Token_Text('You '),
 345                     new HTMLPurifier_Token_Text('<'),
 346                     new HTMLPurifier_Token_Text('b'),
 347                     new HTMLPurifier_Token_Text('>'),
 348                     new HTMLPurifier_Token_Text('can'),
 349                     new HTMLPurifier_Token_Text('&'),
 350                     new HTMLPurifier_Token_Text('#39;t'),
 351                     new HTMLPurifier_Token_Text('<'),
 352                     new HTMLPurifier_Token_Text('/b'),
 353                     new HTMLPurifier_Token_Text('>'),
 354                     new HTMLPurifier_Token_Text(' get me!'),
 355                 ),
 356                 'PH5P' => $split,
 357             )
 358         );
 359     }
 360
 361     function test_tokenizeHTML_characterEntity() {
 362         $this->assertTokenization(
 363             '&theta;',
 364             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 365         );
 366     }
 367
 368     function test_tokenizeHTML_characterEntityInCDATA() {
 369         $this->assertTokenization(
 370             '<![CDATA[&rarr;]]>',
 371             array( new HTMLPurifier_Token_Text("&rarr;") ),
 372             array(
 373                 'PEARSax3' => $split = array(
 374                     new HTMLPurifier_Token_Text('&'),
 375                     new HTMLPurifier_Token_Text('rarr;'),
 376                 ),
 377                 'PH5P' => $split,
 378             )
 379         );
 380     }
 381
 382     function test_tokenizeHTML_entityInAttribute() {
 383         $this->assertTokenization(
 384             '<a href="index.php?title=foo&amp;id=bar">Link</a>',
 385             array(
 386                 new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
 387                 new HTMLPurifier_Token_Text('Link'),
 388                 new HTMLPurifier_Token_End('a'),
 389             )
 390         );
 391     }
 392
 393     function test_tokenizeHTML_preserveUTF8() {
 394         $this->assertTokenization(
 395             "\xCE\xB8",
 396             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 397         );
 398     }
 399
 400     function test_tokenizeHTML_specialEntityInAttribute() {
 401         $this->assertTokenization(
 402             '<br test="x &lt; 6" />',
 403             array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
 404         );
 405     }
 406
 407     function test_tokenizeHTML_emoticonProtection() {
 408         $this->config->set('Core', 'AggressivelyFixLt', true);
 409         $this->assertTokenization(
 410             '<b>Whoa! <3 That\'s not good >.></b>',
 411             array(
 412                 new HTMLPurifier_Token_Start('b'),
 413                 new HTMLPurifier_Token_Text('Whoa! '),
 414                 new HTMLPurifier_Token_Text('<3 That\'s not good >'),
 415                 new HTMLPurifier_Token_Text('.>'),
 416                 new HTMLPurifier_Token_End('b')
 417             ),
 418             array(
 419                 // text is absorbed together
 420                 'DOMLex' => array(
 421                     new HTMLPurifier_Token_Start('b'),
 422                     new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
 423                     new HTMLPurifier_Token_End('b'),
 424                 ),
 425                 'PEARSax3' => false, // totally mangled
 426                 'PH5P' => array( // interesting grouping
 427                     new HTMLPurifier_Token_Start('b'),
 428                     new HTMLPurifier_Token_Text('Whoa! '),
 429                     new HTMLPurifier_Token_Text('<'),
 430                     new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 431                     new HTMLPurifier_Token_End('b'),
 432                 ),
 433             )
 434         );
 435     }
 436
 437     function test_tokenizeHTML_commentWithFunkyChars() {
 438         $this->assertTokenization(
 439             '<!-- This >< comment --><br />',
 440             array(
 441                 new HTMLPurifier_Token_Comment(' This >< comment '),
 442                 new HTMLPurifier_Token_Empty('br'),
 443             ),
 444             array(
 445                 'PEARSax3' => false,
 446             )
 447         );
 448     }
 449
 450     function test_tokenizeHTML_unterminatedComment() {
 451         $this->assertTokenization(
 452             '<!-- This >< comment',
 453             array( new HTMLPurifier_Token_Comment(' This >< comment') ),
 454             array(
 455                 'DOMLex'   => false,
 456                 'PEARSax3' => false,
 457                 'PH5P'     => false,
 458             )
 459         );
 460     }
 461
 462     function test_tokenizeHTML_scriptCDATAContents() {
 463         $this->config->set('HTML', 'Trusted', true);
 464         $this->assertTokenization(
 465             'Foo: <script>alert("<foo>");</script>',
 466             array(
 467                 new HTMLPurifier_Token_Text('Foo: '),
 468                 new HTMLPurifier_Token_Start('script'),
 469                 new HTMLPurifier_Token_Text('alert("<foo>");'),
 470                 new HTMLPurifier_Token_End('script'),
 471             ),
 472             array(
 473                 'PEARSax3' => false,
 474                 // PH5P, for some reason, bubbles the script to <head>
 475                 'PH5P' => false,
 476             )
 477         );
 478     }
 479
 480     function test_tokenizeHTML_entitiesInComment() {
 481         $this->config->set('Core', 'AggressivelyFixLt', true);
 482         $this->assertTokenization(
 483             '<!-- This comment < &lt; & -->',
 484             array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
 485             array(
 486                 'PEARSax3' => false
 487             )
 488         );
 489     }
 490
 491     function test_tokenizeHTML_attributeWithSpecialCharacters() {
 492         $this->assertTokenization(
 493             '<a href="><>">',
 494             array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
 495             array(
 496                 'DirectLex' => array(
 497                     new HTMLPurifier_Token_Start('a', array('href' => '')),
 498                     new HTMLPurifier_Token_Text('<">'),
 499                 ),
 500                 'PEARSax3' => false,
 501             )
 502         );
 503     }
 504
 505     function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
 506         $this->assertTokenization(
 507             '<param name="src" value="http://example.com/video.wmv" />',
 508             array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
 509         );
 510     }
 511
 512     function test_tokenizeHTML_style() {
 513         $extra = array(
 514                 // PH5P doesn't seem to like style tags
 515                 'PH5P' => false,
 516                 // DirectLex defers to RemoveForeignElements for textification
 517                 'DirectLex' => array(
 518                     new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 519                     new HTMLPurifier_Token_Comment("\ndiv {}\n"),
 520                     new HTMLPurifier_Token_End('style'),
 521                 ),
 522             );
 523         if (!defined('LIBXML_VERSION') || LIBXML_VERSION < 20628) {
 524             // libxml's behavior is wrong prior to this version, so make
 525             // appropriate accomodations
 526             // :NOTE: LIBXML_VERSION is missing in early versions of PHP
 527             // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
 528             // this translates to 5.0.x. In such cases, we assume that an old
 529             // version of libxml is being used, although that *might* not
 530             // be the case (it's very unlikely though)
 531             $extra['DOMLex'] = $extra['DirectLex'];
 532         }
 533         $this->assertTokenization(
 534 '<style type="text/css"><!--
 535 div {}
 536 --></style>',
 537             array(
 538                 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 539                 new HTMLPurifier_Token_Text("\ndiv {}\n"),
 540                 new HTMLPurifier_Token_End('style'),
 541             ),
 542             $extra
 543         );
 544     }
 545
 546     function test_tokenizeHTML_() {
 547         $this->assertTokenization(
 548             '<a@>>',
 549             array(
 550                 new HTMLPurifier_Token_Start('a'),
 551                 new HTMLPurifier_Token_Text('>'),
 552                 new HTMLPurifier_Token_End('a'),
 553             ),
 554             array(
 555                 'DirectLex' => array(
 556                     // Technically this is invalid, but it won't be a
 557                     // problem with invalid element removal; also, this
 558                     // mimics Mozilla's parsing of the tag.
 559                     new HTMLPurifier_Token_Start('a@'),
 560                     new HTMLPurifier_Token_Text('>'),
 561                 ),
 562             )
 563         );
 564     }
 565
 566     /*
 567
 568     function test_tokenizeHTML_() {
 569         $this->assertTokenization(
 570             ,
 571             array(
 572
 573             )
 574         );
 575     }
 576     */
 577
 578 }
 579