tests/HTMLPurifier/LexerTest.php

   1 <?php
   2
   3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
   4 {
   5
   6     protected $_has_pear = false;
   7
   8     public function __construct() {
   9         parent::__construct();
  10         if ($GLOBALS['HTMLPurifierTest']['PEAR']) {
  11             require_once 'HTMLPurifier/Lexer/PEARSax3.php';
  12             $this->_has_pear = true;
  13         }
  14         if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
  15             require_once 'HTMLPurifier/Lexer/PH5P.php';
  16         }
  17     }
  18
  19     // HTMLPurifier_Lexer::create() --------------------------------------------
  20
  21     function test_create() {
  22         $this->config->set('Core.MaintainLineNumbers', true);
  23         $lexer = HTMLPurifier_Lexer::create($this->config);
  24         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  25     }
  26
  27     function test_create_objectLexerImpl() {
  28         $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
  29         $lexer = HTMLPurifier_Lexer::create($this->config);
  30         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  31     }
  32
  33     function test_create_unknownLexer() {
  34         $this->config->set('Core.LexerImpl', 'AsdfAsdf');
  35         $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
  36         HTMLPurifier_Lexer::create($this->config);
  37     }
  38
  39     function test_create_incompatibleLexer() {
  40         $this->config->set('Core.LexerImpl', 'DOMLex');
  41         $this->config->set('Core.MaintainLineNumbers', true);
  42         $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
  43         HTMLPurifier_Lexer::create($this->config);
  44     }
  45
  46     // HTMLPurifier_Lexer->parseData() -----------------------------------------
  47
  48     function assertParseData($input, $expect = true) {
  49         if ($expect === true) $expect = $input;
  50         $lexer = new HTMLPurifier_Lexer();
  51         $this->assertIdentical($expect, $lexer->parseData($input));
  52     }
  53
  54     function test_parseData_plainText() {
  55         $this->assertParseData('asdf');
  56     }
  57
  58     function test_parseData_ampersandEntity() {
  59         $this->assertParseData('&amp;', '&');
  60     }
  61
  62     function test_parseData_quotEntity() {
  63         $this->assertParseData('&quot;', '"');
  64     }
  65
  66     function test_parseData_aposNumericEntity() {
  67         $this->assertParseData('&#039;', "'");
  68     }
  69
  70     function test_parseData_aposCompactNumericEntity() {
  71         $this->assertParseData('&#39;', "'");
  72     }
  73
  74     function test_parseData_adjacentAmpersandEntities() {
  75         $this->assertParseData('&amp;&amp;&amp;', '&&&');
  76     }
  77
  78     function test_parseData_trailingUnescapedAmpersand() {
  79         $this->assertParseData('&amp;&', '&&');
  80     }
  81
  82     function test_parseData_internalUnescapedAmpersand() {
  83         $this->assertParseData('Procter & Gamble');
  84     }
  85
  86     function test_parseData_improperEntityFaultToleranceTest() {
  87         $this->assertParseData('&#x2D;');
  88     }
  89
  90     // HTMLPurifier_Lexer->extractBody() ---------------------------------------
  91
  92     function assertExtractBody($text, $extract = true) {
  93         $lexer = new HTMLPurifier_Lexer();
  94         $result = $lexer->extractBody($text);
  95         if ($extract === true) $extract = $text;
  96         $this->assertIdentical($extract, $result);
  97     }
  98
  99     function test_extractBody_noBodyTags() {
 100         $this->assertExtractBody('<b>Bold</b>');
 101     }
 102
 103     function test_extractBody_lowercaseBodyTags() {
 104         $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
 105     }
 106
 107     function test_extractBody_uppercaseBodyTags() {
 108         $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
 109     }
 110
 111     function test_extractBody_realisticUseCase() {
 112         $this->assertExtractBody(
 113 '<?xml version="1.0"
 114 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 115     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 116 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 117    <head>
 118       <title>xyz</title>
 119    </head>
 120    <body>
 121       <form method="post" action="whatever1">
 122          <div>
 123             <input type="text" name="username" />
 124             <input type="text" name="password" />
 125             <input type="submit" />
 126          </div>
 127       </form>
 128    </body>
 129 </html>',
 130     '
 131       <form method="post" action="whatever1">
 132          <div>
 133             <input type="text" name="username" />
 134             <input type="text" name="password" />
 135             <input type="submit" />
 136          </div>
 137       </form>
 138    ');
 139     }
 140
 141     function test_extractBody_bodyWithAttributes() {
 142         $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
 143     }
 144
 145     function test_extractBody_preserveUnclosedBody() {
 146         $this->assertExtractBody('<body>asdf'); // not closed, don't accept
 147     }
 148
 149     function test_extractBody_useLastBody() {
 150         $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
 151     }
 152
 153     // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
 154
 155     function assertTokenization($input, $expect, $alt_expect = array()) {
 156         $lexers = array();
 157         $lexers['DirectLex']  = new HTMLPurifier_Lexer_DirectLex();
 158         if ($this->_has_pear) $lexers['PEARSax3']   = new HTMLPurifier_Lexer_PEARSax3();
 159         if (class_exists('DOMDocument')) {
 160             $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
 161             $lexers['PH5P']   = new HTMLPurifier_Lexer_PH5P();
 162         }
 163         foreach ($lexers as $name => $lexer) {
 164             $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
 165             if (isset($alt_expect[$name])) {
 166                 if ($alt_expect[$name] === false) continue;
 167                 $t_expect = $alt_expect[$name];
 168                 $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
 169             } else {
 170                 $t_expect = $expect;
 171                 $this->assertIdentical($result, $expect, "$name: %s");
 172             }
 173             if ($t_expect != $result) {
 174                 printTokens($result);
 175             }
 176         }
 177     }
 178
 179     function test_tokenizeHTML_emptyInput() {
 180         $this->assertTokenization('', array());
 181     }
 182
 183     function test_tokenizeHTML_plainText() {
 184         $this->assertTokenization(
 185             'This is regular text.',
 186             array(
 187                 new HTMLPurifier_Token_Text('This is regular text.')
 188             )
 189         );
 190     }
 191
 192     function test_tokenizeHTML_textAndTags() {
 193         $this->assertTokenization(
 194             'This is <b>bold</b> text',
 195             array(
 196                 new HTMLPurifier_Token_Text('This is '),
 197                 new HTMLPurifier_Token_Start('b', array()),
 198                 new HTMLPurifier_Token_Text('bold'),
 199                 new HTMLPurifier_Token_End('b'),
 200                 new HTMLPurifier_Token_Text(' text'),
 201             )
 202         );
 203     }
 204
 205     function test_tokenizeHTML_normalizeCase() {
 206         $this->assertTokenization(
 207             '<DIV>Totally rad dude. <b>asdf</b></div>',
 208             array(
 209                 new HTMLPurifier_Token_Start('DIV', array()),
 210                 new HTMLPurifier_Token_Text('Totally rad dude. '),
 211                 new HTMLPurifier_Token_Start('b', array()),
 212                 new HTMLPurifier_Token_Text('asdf'),
 213                 new HTMLPurifier_Token_End('b'),
 214                 new HTMLPurifier_Token_End('div'),
 215             )
 216         );
 217     }
 218
 219     function test_tokenizeHTML_notWellFormed() {
 220         $this->assertTokenization(
 221             '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
 222             array(
 223                 new HTMLPurifier_Token_Start('asdf'),
 224                 new HTMLPurifier_Token_End('asdf'),
 225                 new HTMLPurifier_Token_Start('d'),
 226                 new HTMLPurifier_Token_End('d'),
 227                 new HTMLPurifier_Token_Start('poOloka'),
 228                 new HTMLPurifier_Token_Start('poolasdf'),
 229                 new HTMLPurifier_Token_Start('ds'),
 230                 new HTMLPurifier_Token_End('asdf'),
 231                 new HTMLPurifier_Token_End('ASDF'),
 232             ),
 233             array(
 234                 'DOMLex' => $alt = array(
 235                     new HTMLPurifier_Token_Empty('asdf'),
 236                     new HTMLPurifier_Token_Empty('d'),
 237                     new HTMLPurifier_Token_Start('pooloka'),
 238                     new HTMLPurifier_Token_Start('poolasdf'),
 239                     new HTMLPurifier_Token_Empty('ds'),
 240                     new HTMLPurifier_Token_End('poolasdf'),
 241                     new HTMLPurifier_Token_End('pooloka'),
 242                 ),
 243                 'PH5P' => $alt,
 244             )
 245         );
 246     }
 247
 248     function test_tokenizeHTML_whitespaceInTag() {
 249         $this->assertTokenization(
 250             '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
 251             array(
 252                 new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
 253                 new HTMLPurifier_Token_Text('Link to '),
 254                 new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
 255                 new HTMLPurifier_Token_Text('foobar'),
 256                 new HTMLPurifier_Token_End('b'),
 257                 new HTMLPurifier_Token_End('a'),
 258             )
 259         );
 260     }
 261
 262     function test_tokenizeHTML_emptyTag() {
 263         $this->assertTokenization(
 264             '<br />',
 265             array( new HTMLPurifier_Token_Empty('br') )
 266         );
 267     }
 268
 269     function test_tokenizeHTML_comment() {
 270         $this->assertTokenization(
 271             '<!-- Comment -->',
 272             array( new HTMLPurifier_Token_Comment(' Comment ') )
 273         );
 274     }
 275
 276     function test_tokenizeHTML_malformedComment() {
 277         $this->assertTokenization(
 278             '<!-- not so well formed --->',
 279             array( new HTMLPurifier_Token_Comment(' not so well formed -') )
 280         );
 281     }
 282
 283     function test_tokenizeHTML_unterminatedTag() {
 284         $this->assertTokenization(
 285             '<a href=""',
 286             array( new HTMLPurifier_Token_Text('<a href=""') ),
 287             array(
 288                 // I like our behavior better, but it's non-standard
 289                 'DOMLex'   => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
 290                 'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
 291                 'PH5P' => false, // total barfing, grabs scaffolding too
 292             )
 293         );
 294     }
 295
 296     function test_tokenizeHTML_specialEntities() {
 297         $this->assertTokenization(
 298             '&lt;b&gt;',
 299             array(
 300                 new HTMLPurifier_Token_Text('<b>')
 301             ),
 302             array(
 303                 // some parsers will separate entities out
 304                 'PEARSax3' => $split = array(
 305                     new HTMLPurifier_Token_Text('<'),
 306                     new HTMLPurifier_Token_Text('b'),
 307                     new HTMLPurifier_Token_Text('>'),
 308                 ),
 309                 'PH5P' => $split,
 310             )
 311         );
 312     }
 313
 314     function test_tokenizeHTML_earlyQuote() {
 315         $this->assertTokenization(
 316             '<a "=>',
 317             array( new HTMLPurifier_Token_Empty('a') ),
 318             array(
 319                 // we barf on this input
 320                 'DirectLex' => $tokens = array(
 321                     new HTMLPurifier_Token_Start('a', array('"' => ''))
 322                 ),
 323                 'PEARSax3' => $tokens,
 324                 'PH5P' => false, // behavior varies; handle this personally
 325             )
 326         );
 327     }
 328
 329     function test_tokenizeHTML_earlyQuote_PH5P() {
 330         if (!class_exists('DOMDocument')) return;
 331         $lexer = new HTMLPurifier_Lexer_PH5P();
 332         $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
 333         if ($this->context->get('PH5PError', true)) {
 334             $this->assertIdentical(array(
 335                 new HTMLPurifier_Token_Start('a', array('"' => ''))
 336             ), $result);
 337         } else {
 338             $this->assertIdentical(array(
 339                 new HTMLPurifier_Token_Empty('a', array('"' => ''))
 340             ), $result);
 341         }
 342     }
 343
 344     function test_tokenizeHTML_unescapedQuote() {
 345         $this->assertTokenization(
 346             '"',
 347             array( new HTMLPurifier_Token_Text('"') )
 348         );
 349     }
 350
 351     function test_tokenizeHTML_escapedQuote() {
 352         $this->assertTokenization(
 353             '&quot;',
 354             array( new HTMLPurifier_Token_Text('"') ),
 355             array(
 356                 'PEARSax3' => false, // PEAR barfs on this
 357             )
 358         );
 359     }
 360
 361     function test_tokenizeHTML_cdata() {
 362         $this->assertTokenization(
 363             '<![CDATA[You <b>can&#39;t</b> get me!]]>',
 364             array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
 365             array(
 366                 // PEAR splits up all of the CDATA
 367                 'PEARSax3' => $split = array(
 368                     new HTMLPurifier_Token_Text('You '),
 369                     new HTMLPurifier_Token_Text('<'),
 370                     new HTMLPurifier_Token_Text('b'),
 371                     new HTMLPurifier_Token_Text('>'),
 372                     new HTMLPurifier_Token_Text('can'),
 373                     new HTMLPurifier_Token_Text('&'),
 374                     new HTMLPurifier_Token_Text('#39;t'),
 375                     new HTMLPurifier_Token_Text('<'),
 376                     new HTMLPurifier_Token_Text('/b'),
 377                     new HTMLPurifier_Token_Text('>'),
 378                     new HTMLPurifier_Token_Text(' get me!'),
 379                 ),
 380                 'PH5P' => $split,
 381             )
 382         );
 383     }
 384
 385     function test_tokenizeHTML_characterEntity() {
 386         $this->assertTokenization(
 387             '&theta;',
 388             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 389         );
 390     }
 391
 392     function test_tokenizeHTML_characterEntityInCDATA() {
 393         $this->assertTokenization(
 394             '<![CDATA[&rarr;]]>',
 395             array( new HTMLPurifier_Token_Text("&rarr;") ),
 396             array(
 397                 'PEARSax3' => $split = array(
 398                     new HTMLPurifier_Token_Text('&'),
 399                     new HTMLPurifier_Token_Text('rarr;'),
 400                 ),
 401                 'PH5P' => $split,
 402             )
 403         );
 404     }
 405
 406     function test_tokenizeHTML_entityInAttribute() {
 407         $this->assertTokenization(
 408             '<a href="index.php?title=foo&amp;id=bar">Link</a>',
 409             array(
 410                 new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
 411                 new HTMLPurifier_Token_Text('Link'),
 412                 new HTMLPurifier_Token_End('a'),
 413             )
 414         );
 415     }
 416
 417     function test_tokenizeHTML_preserveUTF8() {
 418         $this->assertTokenization(
 419             "\xCE\xB8",
 420             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 421         );
 422     }
 423
 424     function test_tokenizeHTML_specialEntityInAttribute() {
 425         $this->assertTokenization(
 426             '<br test="x &lt; 6" />',
 427             array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
 428         );
 429     }
 430
 431     function test_tokenizeHTML_emoticonProtection() {
 432         $this->assertTokenization(
 433             '<b>Whoa! <3 That\'s not good >.></b>',
 434             array(
 435                 new HTMLPurifier_Token_Start('b'),
 436                 new HTMLPurifier_Token_Text('Whoa! '),
 437                 new HTMLPurifier_Token_Text('<'),
 438                 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 439                 new HTMLPurifier_Token_End('b')
 440             ),
 441             array(
 442                 // text is absorbed together
 443                 'DOMLex' => array(
 444                     new HTMLPurifier_Token_Start('b'),
 445                     new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
 446                     new HTMLPurifier_Token_End('b'),
 447                 ),
 448                 'PEARSax3' => false, // totally mangled
 449                 'PH5P' => array( // interesting grouping
 450                     new HTMLPurifier_Token_Start('b'),
 451                     new HTMLPurifier_Token_Text('Whoa! '),
 452                     new HTMLPurifier_Token_Text('<'),
 453                     new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 454                     new HTMLPurifier_Token_End('b'),
 455                 ),
 456             )
 457         );
 458     }
 459
 460     function test_tokenizeHTML_commentWithFunkyChars() {
 461         $this->assertTokenization(
 462             '<!-- This >< comment --><br />',
 463             array(
 464                 new HTMLPurifier_Token_Comment(' This >< comment '),
 465                 new HTMLPurifier_Token_Empty('br'),
 466             ),
 467             array(
 468                 'PEARSax3' => false,
 469             )
 470         );
 471     }
 472
 473     function test_tokenizeHTML_unterminatedComment() {
 474         $this->assertTokenization(
 475             '<!-- This >< comment',
 476             array( new HTMLPurifier_Token_Comment(' This >< comment') ),
 477             array(
 478                 'DOMLex'   => false,
 479                 'PEARSax3' => false,
 480                 'PH5P'     => false,
 481             )
 482         );
 483     }
 484
 485     function test_tokenizeHTML_scriptCDATAContents() {
 486         $this->config->set('HTML.Trusted', true);
 487         $this->assertTokenization(
 488             'Foo: <script>alert("<foo>");</script>',
 489             array(
 490                 new HTMLPurifier_Token_Text('Foo: '),
 491                 new HTMLPurifier_Token_Start('script'),
 492                 new HTMLPurifier_Token_Text('alert("<foo>");'),
 493                 new HTMLPurifier_Token_End('script'),
 494             ),
 495             array(
 496                 'PEARSax3' => false,
 497                 // PH5P, for some reason, bubbles the script to <head>
 498                 'PH5P' => false,
 499             )
 500         );
 501     }
 502
 503     function test_tokenizeHTML_entitiesInComment() {
 504         $this->assertTokenization(
 505             '<!-- This comment < &lt; & -->',
 506             array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
 507             array(
 508                 'PEARSax3' => false
 509             )
 510         );
 511     }
 512
 513     function test_tokenizeHTML_attributeWithSpecialCharacters() {
 514         $this->assertTokenization(
 515             '<a href="><>">',
 516             array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
 517             array(
 518                 'DirectLex' => array(
 519                     new HTMLPurifier_Token_Start('a', array('href' => '')),
 520                     new HTMLPurifier_Token_Text('<'),
 521                     new HTMLPurifier_Token_Text('">'),
 522                 ),
 523                 'PEARSax3' => false,
 524             )
 525         );
 526     }
 527
 528     function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
 529         $this->assertTokenization(
 530             '<param name="src" value="http://example.com/video.wmv" />',
 531             array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
 532         );
 533     }
 534
 535     function test_tokenizeHTML_style() {
 536         $extra = array(
 537                 // PH5P doesn't seem to like style tags
 538                 'PH5P' => false,
 539                 // DirectLex defers to RemoveForeignElements for textification
 540                 'DirectLex' => array(
 541                     new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 542                     new HTMLPurifier_Token_Comment("\ndiv {}\n"),
 543                     new HTMLPurifier_Token_End('style'),
 544                 ),
 545             );
 546         if (!defined('LIBXML_VERSION')) {
 547             // LIBXML_VERSION is missing in early versions of PHP
 548             // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
 549             // this translates to 5.0.x. In such cases, punt the test entirely.
 550             return;
 551         } elseif (LIBXML_VERSION < 20628) {
 552             // libxml's behavior is wrong prior to this version, so make
 553             // appropriate accomodations
 554             $extra['DOMLex'] = $extra['DirectLex'];
 555         }
 556         $this->assertTokenization(
 557 '<style type="text/css"><!--
 558 div {}
 559 --></style>',
 560             array(
 561                 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 562                 new HTMLPurifier_Token_Text("\ndiv {}\n"),
 563                 new HTMLPurifier_Token_End('style'),
 564             ),
 565             $extra
 566         );
 567     }
 568
 569     function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
 570         $alt_expect = array(
 571             // Technically this is invalid, but it won't be a
 572             // problem with invalid element removal; also, this
 573             // mimics Mozilla's parsing of the tag.
 574             new HTMLPurifier_Token_Start('a@'),
 575             new HTMLPurifier_Token_Text('>'),
 576         );
 577         $this->assertTokenization(
 578             '<a@>>',
 579             array(
 580                 new HTMLPurifier_Token_Start('a'),
 581                 new HTMLPurifier_Token_Text('>'),
 582                 new HTMLPurifier_Token_End('a'),
 583             ),
 584             array(
 585                 'DirectLex' => $alt_expect,
 586                 'PEARSax3' => $alt_expect,
 587             )
 588         );
 589     }
 590
 591     function test_tokenizeHTML_emoticonHeart() {
 592         $this->assertTokenization(
 593             '<br /><3<br />',
 594             array(
 595                 new HTMLPurifier_Token_Empty('br'),
 596                 new HTMLPurifier_Token_Text('<'),
 597                 new HTMLPurifier_Token_Text('3'),
 598                 new HTMLPurifier_Token_Empty('br'),
 599             ),
 600             array(
 601                 'DOMLex' => array(
 602                     new HTMLPurifier_Token_Empty('br'),
 603                     new HTMLPurifier_Token_Text('<3'),
 604                     new HTMLPurifier_Token_Empty('br'),
 605                 ),
 606                 'PEARSax3' => array(
 607                     // bah too lazy to fix this
 608                     new HTMLPurifier_Token_Empty('br'),
 609                     new HTMLPurifier_Token_Empty('3<br'),
 610                 ),
 611             )
 612         );
 613     }
 614
 615     function test_tokenizeHTML_emoticonShiftyEyes() {
 616         $this->assertTokenization(
 617             '<b><<</b>',
 618             array(
 619                 new HTMLPurifier_Token_Start('b'),
 620                 new HTMLPurifier_Token_Text('<'),
 621                 new HTMLPurifier_Token_Text('<'),
 622                 new HTMLPurifier_Token_End('b'),
 623             ),
 624             array(
 625                 'DOMLex' => array(
 626                     new HTMLPurifier_Token_Start('b'),
 627                     new HTMLPurifier_Token_Text('<<'),
 628                     new HTMLPurifier_Token_End('b'),
 629                 ),
 630                 'PEARSax3' => array(
 631                     // also too lazy to fix
 632                     new HTMLPurifier_Token_Start('b'),
 633                     new HTMLPurifier_Token_Empty('<<'),
 634                     new HTMLPurifier_Token_Text('b>'),
 635                 ),
 636             )
 637         );
 638     }
 639
 640     function test_tokenizeHTML_eon1996() {
 641         $this->assertTokenization(
 642             '< <b>test</b>',
 643             array(
 644                 new HTMLPurifier_Token_Text('<'),
 645                 new HTMLPurifier_Token_Text(' '),
 646                 new HTMLPurifier_Token_Start('b'),
 647                 new HTMLPurifier_Token_Text('test'),
 648                 new HTMLPurifier_Token_End('b'),
 649             ),
 650             array(
 651                 'DOMLex' => array(
 652                     new HTMLPurifier_Token_Text('< '),
 653                     new HTMLPurifier_Token_Start('b'),
 654                     new HTMLPurifier_Token_Text('test'),
 655                     new HTMLPurifier_Token_End('b'),
 656                 ),
 657                 'PEARSax3' => array(
 658                     // totally doing the wrong thing here
 659                     new HTMLPurifier_Token_Text(' '),
 660                     new HTMLPurifier_Token_Start('b'),
 661                     new HTMLPurifier_Token_Text('test'),
 662                     new HTMLPurifier_Token_End('b'),
 663                 ),
 664             )
 665         );
 666     }
 667
 668     function test_tokenizeHTML_bodyInCDATA() {
 669         $alt_tokens = array(
 670             new HTMLPurifier_Token_Text('<'),
 671             new HTMLPurifier_Token_Text('body'),
 672             new HTMLPurifier_Token_Text('>'),
 673             new HTMLPurifier_Token_Text('Foo'),
 674             new HTMLPurifier_Token_Text('<'),
 675             new HTMLPurifier_Token_Text('/body'),
 676             new HTMLPurifier_Token_Text('>'),
 677         );
 678         $this->assertTokenization(
 679             '<![CDATA[<body>Foo</body>]]>',
 680             array(
 681                 new HTMLPurifier_Token_Text('<body>Foo</body>'),
 682             ),
 683             array(
 684                 'PH5P' => $alt_tokens,
 685                 'PEARSax3' => $alt_tokens,
 686             )
 687         );
 688     }
 689
 690     function test_tokenizeHTML_() {
 691         $this->assertTokenization(
 692             '<a><img /></a>',
 693             array(
 694                 new HTMLPurifier_Token_Start('a'),
 695                 new HTMLPurifier_Token_Empty('img'),
 696                 new HTMLPurifier_Token_End('a'),
 697             )
 698         );
 699     }
 700
 701     /*
 702
 703     function test_tokenizeHTML_() {
 704         $this->assertTokenization(
 705             ,
 706             array(
 707
 708             )
 709         );
 710     }
 711     */
 712
 713 }
 714
 715 // vim: et sw=4 sts=4