tests/HTMLPurifier/LexerTest.php

   1 <?php
   2
   3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
   4 {
   5
   6     protected $_has_pear = false;
   7
   8     public function __construct() {
   9         parent::__construct();
  10         if ($GLOBALS['HTMLPurifierTest']['PEAR'] &&
  11         // PEARSax3 is not maintained and throws loads of DEPRECATED
  12         // errors in PHP 5.3
  13         version_compare(PHP_VERSION, '5.3', '<')) {
  14             require_once 'HTMLPurifier/Lexer/PEARSax3.php';
  15             $this->_has_pear = true;
  16         }
  17         if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
  18             require_once 'HTMLPurifier/Lexer/PH5P.php';
  19         }
  20     }
  21
  22     // HTMLPurifier_Lexer::create() --------------------------------------------
  23
  24     function test_create() {
  25         $this->config->set('Core.MaintainLineNumbers', true);
  26         $lexer = HTMLPurifier_Lexer::create($this->config);
  27         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  28     }
  29
  30     function test_create_objectLexerImpl() {
  31         $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
  32         $lexer = HTMLPurifier_Lexer::create($this->config);
  33         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  34     }
  35
  36     function test_create_unknownLexer() {
  37         $this->config->set('Core.LexerImpl', 'AsdfAsdf');
  38         $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
  39         HTMLPurifier_Lexer::create($this->config);
  40     }
  41
  42     function test_create_incompatibleLexer() {
  43         $this->config->set('Core.LexerImpl', 'DOMLex');
  44         $this->config->set('Core.MaintainLineNumbers', true);
  45         $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
  46         HTMLPurifier_Lexer::create($this->config);
  47     }
  48
  49     // HTMLPurifier_Lexer->parseData() -----------------------------------------
  50
  51     function assertParseData($input, $expect = true) {
  52         if ($expect === true) $expect = $input;
  53         $lexer = new HTMLPurifier_Lexer();
  54         $this->assertIdentical($expect, $lexer->parseData($input));
  55     }
  56
  57     function test_parseData_plainText() {
  58         $this->assertParseData('asdf');
  59     }
  60
  61     function test_parseData_ampersandEntity() {
  62         $this->assertParseData('&amp;', '&');
  63     }
  64
  65     function test_parseData_quotEntity() {
  66         $this->assertParseData('&quot;', '"');
  67     }
  68
  69     function test_parseData_aposNumericEntity() {
  70         $this->assertParseData('&#039;', "'");
  71     }
  72
  73     function test_parseData_aposCompactNumericEntity() {
  74         $this->assertParseData('&#39;', "'");
  75     }
  76
  77     function test_parseData_adjacentAmpersandEntities() {
  78         $this->assertParseData('&amp;&amp;&amp;', '&&&');
  79     }
  80
  81     function test_parseData_trailingUnescapedAmpersand() {
  82         $this->assertParseData('&amp;&', '&&');
  83     }
  84
  85     function test_parseData_internalUnescapedAmpersand() {
  86         $this->assertParseData('Procter & Gamble');
  87     }
  88
  89     function test_parseData_improperEntityFaultToleranceTest() {
  90         $this->assertParseData('&#x2D;');
  91     }
  92
  93     // HTMLPurifier_Lexer->extractBody() ---------------------------------------
  94
  95     function assertExtractBody($text, $extract = true) {
  96         $lexer = new HTMLPurifier_Lexer();
  97         $result = $lexer->extractBody($text);
  98         if ($extract === true) $extract = $text;
  99         $this->assertIdentical($extract, $result);
 100     }
 101
 102     function test_extractBody_noBodyTags() {
 103         $this->assertExtractBody('<b>Bold</b>');
 104     }
 105
 106     function test_extractBody_lowercaseBodyTags() {
 107         $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
 108     }
 109
 110     function test_extractBody_uppercaseBodyTags() {
 111         $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
 112     }
 113
 114     function test_extractBody_realisticUseCase() {
 115         $this->assertExtractBody(
 116 '<?xml version="1.0"
 117 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 118     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 119 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 120    <head>
 121       <title>xyz</title>
 122    </head>
 123    <body>
 124       <form method="post" action="whatever1">
 125          <div>
 126             <input type="text" name="username" />
 127             <input type="text" name="password" />
 128             <input type="submit" />
 129          </div>
 130       </form>
 131    </body>
 132 </html>',
 133     '
 134       <form method="post" action="whatever1">
 135          <div>
 136             <input type="text" name="username" />
 137             <input type="text" name="password" />
 138             <input type="submit" />
 139          </div>
 140       </form>
 141    ');
 142     }
 143
 144     function test_extractBody_bodyWithAttributes() {
 145         $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
 146     }
 147
 148     function test_extractBody_preserveUnclosedBody() {
 149         $this->assertExtractBody('<body>asdf'); // not closed, don't accept
 150     }
 151
 152     function test_extractBody_useLastBody() {
 153         $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
 154     }
 155
 156     // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
 157
 158     function assertTokenization($input, $expect, $alt_expect = array()) {
 159         $lexers = array();
 160         $lexers['DirectLex']  = new HTMLPurifier_Lexer_DirectLex();
 161         if ($this->_has_pear) $lexers['PEARSax3']   = new HTMLPurifier_Lexer_PEARSax3();
 162         if (class_exists('DOMDocument')) {
 163             $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
 164             $lexers['PH5P']   = new HTMLPurifier_Lexer_PH5P();
 165         }
 166         foreach ($lexers as $name => $lexer) {
 167             $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
 168             if (isset($alt_expect[$name])) {
 169                 if ($alt_expect[$name] === false) continue;
 170                 $t_expect = $alt_expect[$name];
 171                 $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
 172             } else {
 173                 $t_expect = $expect;
 174                 $this->assertIdentical($result, $expect, "$name: %s");
 175             }
 176             if ($t_expect != $result) {
 177                 printTokens($result);
 178             }
 179         }
 180     }
 181
 182     function test_tokenizeHTML_emptyInput() {
 183         $this->assertTokenization('', array());
 184     }
 185
 186     function test_tokenizeHTML_plainText() {
 187         $this->assertTokenization(
 188             'This is regular text.',
 189             array(
 190                 new HTMLPurifier_Token_Text('This is regular text.')
 191             )
 192         );
 193     }
 194
 195     function test_tokenizeHTML_textAndTags() {
 196         $this->assertTokenization(
 197             'This is <b>bold</b> text',
 198             array(
 199                 new HTMLPurifier_Token_Text('This is '),
 200                 new HTMLPurifier_Token_Start('b', array()),
 201                 new HTMLPurifier_Token_Text('bold'),
 202                 new HTMLPurifier_Token_End('b'),
 203                 new HTMLPurifier_Token_Text(' text'),
 204             )
 205         );
 206     }
 207
 208     function test_tokenizeHTML_normalizeCase() {
 209         $this->assertTokenization(
 210             '<DIV>Totally rad dude. <b>asdf</b></div>',
 211             array(
 212                 new HTMLPurifier_Token_Start('DIV', array()),
 213                 new HTMLPurifier_Token_Text('Totally rad dude. '),
 214                 new HTMLPurifier_Token_Start('b', array()),
 215                 new HTMLPurifier_Token_Text('asdf'),
 216                 new HTMLPurifier_Token_End('b'),
 217                 new HTMLPurifier_Token_End('div'),
 218             )
 219         );
 220     }
 221
 222     function test_tokenizeHTML_notWellFormed() {
 223         $this->assertTokenization(
 224             '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
 225             array(
 226                 new HTMLPurifier_Token_Start('asdf'),
 227                 new HTMLPurifier_Token_End('asdf'),
 228                 new HTMLPurifier_Token_Start('d'),
 229                 new HTMLPurifier_Token_End('d'),
 230                 new HTMLPurifier_Token_Start('poOloka'),
 231                 new HTMLPurifier_Token_Start('poolasdf'),
 232                 new HTMLPurifier_Token_Start('ds'),
 233                 new HTMLPurifier_Token_End('asdf'),
 234                 new HTMLPurifier_Token_End('ASDF'),
 235             ),
 236             array(
 237                 'DOMLex' => $alt = array(
 238                     new HTMLPurifier_Token_Empty('asdf'),
 239                     new HTMLPurifier_Token_Empty('d'),
 240                     new HTMLPurifier_Token_Start('pooloka'),
 241                     new HTMLPurifier_Token_Start('poolasdf'),
 242                     new HTMLPurifier_Token_Empty('ds'),
 243                     new HTMLPurifier_Token_End('poolasdf'),
 244                     new HTMLPurifier_Token_End('pooloka'),
 245                 ),
 246                 'PH5P' => $alt,
 247             )
 248         );
 249     }
 250
 251     function test_tokenizeHTML_whitespaceInTag() {
 252         $this->assertTokenization(
 253             '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
 254             array(
 255                 new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
 256                 new HTMLPurifier_Token_Text('Link to '),
 257                 new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
 258                 new HTMLPurifier_Token_Text('foobar'),
 259                 new HTMLPurifier_Token_End('b'),
 260                 new HTMLPurifier_Token_End('a'),
 261             )
 262         );
 263     }
 264
 265     function test_tokenizeHTML_emptyTag() {
 266         $this->assertTokenization(
 267             '<br />',
 268             array( new HTMLPurifier_Token_Empty('br') )
 269         );
 270     }
 271
 272     function test_tokenizeHTML_comment() {
 273         $this->assertTokenization(
 274             '<!-- Comment -->',
 275             array( new HTMLPurifier_Token_Comment(' Comment ') )
 276         );
 277     }
 278
 279     function test_tokenizeHTML_malformedComment() {
 280         $this->assertTokenization(
 281             '<!-- not so well formed --->',
 282             array( new HTMLPurifier_Token_Comment(' not so well formed -') )
 283         );
 284     }
 285
 286     function test_tokenizeHTML_unterminatedTag() {
 287         $this->assertTokenization(
 288             '<a href=""',
 289             array( new HTMLPurifier_Token_Text('<a href=""') ),
 290             array(
 291                 // I like our behavior better, but it's non-standard
 292                 'DOMLex'   => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
 293                 'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
 294                 'PH5P' => false, // total barfing, grabs scaffolding too
 295             )
 296         );
 297     }
 298
 299     function test_tokenizeHTML_specialEntities() {
 300         $this->assertTokenization(
 301             '&lt;b&gt;',
 302             array(
 303                 new HTMLPurifier_Token_Text('<b>')
 304             ),
 305             array(
 306                 // some parsers will separate entities out
 307                 'PEARSax3' => $split = array(
 308                     new HTMLPurifier_Token_Text('<'),
 309                     new HTMLPurifier_Token_Text('b'),
 310                     new HTMLPurifier_Token_Text('>'),
 311                 ),
 312                 'PH5P' => $split,
 313             )
 314         );
 315     }
 316
 317     function test_tokenizeHTML_earlyQuote() {
 318         $this->assertTokenization(
 319             '<a "=>',
 320             array( new HTMLPurifier_Token_Empty('a') ),
 321             array(
 322                 // we barf on this input
 323                 'DirectLex' => $tokens = array(
 324                     new HTMLPurifier_Token_Start('a', array('"' => ''))
 325                 ),
 326                 'PEARSax3' => $tokens,
 327                 'PH5P' => false, // behavior varies; handle this personally
 328             )
 329         );
 330     }
 331
 332     function test_tokenizeHTML_earlyQuote_PH5P() {
 333         if (!class_exists('DOMDocument')) return;
 334         $lexer = new HTMLPurifier_Lexer_PH5P();
 335         $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
 336         if ($this->context->get('PH5PError', true)) {
 337             $this->assertIdentical(array(
 338                 new HTMLPurifier_Token_Start('a', array('"' => ''))
 339             ), $result);
 340         } else {
 341             $this->assertIdentical(array(
 342                 new HTMLPurifier_Token_Empty('a', array('"' => ''))
 343             ), $result);
 344         }
 345     }
 346
 347     function test_tokenizeHTML_unescapedQuote() {
 348         $this->assertTokenization(
 349             '"',
 350             array( new HTMLPurifier_Token_Text('"') )
 351         );
 352     }
 353
 354     function test_tokenizeHTML_escapedQuote() {
 355         $this->assertTokenization(
 356             '&quot;',
 357             array( new HTMLPurifier_Token_Text('"') ),
 358             array(
 359                 'PEARSax3' => false, // PEAR barfs on this
 360             )
 361         );
 362     }
 363
 364     function test_tokenizeHTML_cdata() {
 365         $this->assertTokenization(
 366             '<![CDATA[You <b>can&#39;t</b> get me!]]>',
 367             array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
 368             array(
 369                 // PEAR splits up all of the CDATA
 370                 'PEARSax3' => $split = array(
 371                     new HTMLPurifier_Token_Text('You '),
 372                     new HTMLPurifier_Token_Text('<'),
 373                     new HTMLPurifier_Token_Text('b'),
 374                     new HTMLPurifier_Token_Text('>'),
 375                     new HTMLPurifier_Token_Text('can'),
 376                     new HTMLPurifier_Token_Text('&'),
 377                     new HTMLPurifier_Token_Text('#39;t'),
 378                     new HTMLPurifier_Token_Text('<'),
 379                     new HTMLPurifier_Token_Text('/b'),
 380                     new HTMLPurifier_Token_Text('>'),
 381                     new HTMLPurifier_Token_Text(' get me!'),
 382                 ),
 383                 'PH5P' => $split,
 384             )
 385         );
 386     }
 387
 388     function test_tokenizeHTML_characterEntity() {
 389         $this->assertTokenization(
 390             '&theta;',
 391             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 392         );
 393     }
 394
 395     function test_tokenizeHTML_characterEntityInCDATA() {
 396         $this->assertTokenization(
 397             '<![CDATA[&rarr;]]>',
 398             array( new HTMLPurifier_Token_Text("&rarr;") ),
 399             array(
 400                 'PEARSax3' => $split = array(
 401                     new HTMLPurifier_Token_Text('&'),
 402                     new HTMLPurifier_Token_Text('rarr;'),
 403                 ),
 404                 'PH5P' => $split,
 405             )
 406         );
 407     }
 408
 409     function test_tokenizeHTML_entityInAttribute() {
 410         $this->assertTokenization(
 411             '<a href="index.php?title=foo&amp;id=bar">Link</a>',
 412             array(
 413                 new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
 414                 new HTMLPurifier_Token_Text('Link'),
 415                 new HTMLPurifier_Token_End('a'),
 416             )
 417         );
 418     }
 419
 420     function test_tokenizeHTML_preserveUTF8() {
 421         $this->assertTokenization(
 422             "\xCE\xB8",
 423             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 424         );
 425     }
 426
 427     function test_tokenizeHTML_specialEntityInAttribute() {
 428         $this->assertTokenization(
 429             '<br test="x &lt; 6" />',
 430             array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
 431         );
 432     }
 433
 434     function test_tokenizeHTML_emoticonProtection() {
 435         $this->assertTokenization(
 436             '<b>Whoa! <3 That\'s not good >.></b>',
 437             array(
 438                 new HTMLPurifier_Token_Start('b'),
 439                 new HTMLPurifier_Token_Text('Whoa! '),
 440                 new HTMLPurifier_Token_Text('<'),
 441                 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 442                 new HTMLPurifier_Token_End('b')
 443             ),
 444             array(
 445                 // text is absorbed together
 446                 'DOMLex' => array(
 447                     new HTMLPurifier_Token_Start('b'),
 448                     new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
 449                     new HTMLPurifier_Token_End('b'),
 450                 ),
 451                 'PEARSax3' => false, // totally mangled
 452                 'PH5P' => array( // interesting grouping
 453                     new HTMLPurifier_Token_Start('b'),
 454                     new HTMLPurifier_Token_Text('Whoa! '),
 455                     new HTMLPurifier_Token_Text('<'),
 456                     new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 457                     new HTMLPurifier_Token_End('b'),
 458                 ),
 459             )
 460         );
 461     }
 462
 463     function test_tokenizeHTML_commentWithFunkyChars() {
 464         $this->assertTokenization(
 465             '<!-- This >< comment --><br />',
 466             array(
 467                 new HTMLPurifier_Token_Comment(' This >< comment '),
 468                 new HTMLPurifier_Token_Empty('br'),
 469             ),
 470             array(
 471                 'PEARSax3' => false,
 472             )
 473         );
 474     }
 475
 476     function test_tokenizeHTML_unterminatedComment() {
 477         $this->assertTokenization(
 478             '<!-- This >< comment',
 479             array( new HTMLPurifier_Token_Comment(' This >< comment') ),
 480             array(
 481                 'DOMLex'   => false,
 482                 'PEARSax3' => false,
 483                 'PH5P'     => false,
 484             )
 485         );
 486     }
 487
 488     function test_tokenizeHTML_scriptCDATAContents() {
 489         $this->config->set('HTML.Trusted', true);
 490         $this->assertTokenization(
 491             'Foo: <script>alert("<foo>");</script>',
 492             array(
 493                 new HTMLPurifier_Token_Text('Foo: '),
 494                 new HTMLPurifier_Token_Start('script'),
 495                 new HTMLPurifier_Token_Text('alert("<foo>");'),
 496                 new HTMLPurifier_Token_End('script'),
 497             ),
 498             array(
 499                 'PEARSax3' => false,
 500                 // PH5P, for some reason, bubbles the script to <head>
 501                 'PH5P' => false,
 502             )
 503         );
 504     }
 505
 506     function test_tokenizeHTML_entitiesInComment() {
 507         $this->assertTokenization(
 508             '<!-- This comment < &lt; & -->',
 509             array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
 510             array(
 511                 'PEARSax3' => false
 512             )
 513         );
 514     }
 515
 516     function test_tokenizeHTML_attributeWithSpecialCharacters() {
 517         $this->assertTokenization(
 518             '<a href="><>">',
 519             array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
 520             array(
 521                 'DirectLex' => array(
 522                     new HTMLPurifier_Token_Start('a', array('href' => '')),
 523                     new HTMLPurifier_Token_Text('<'),
 524                     new HTMLPurifier_Token_Text('">'),
 525                 ),
 526                 'PEARSax3' => false,
 527             )
 528         );
 529     }
 530
 531     function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
 532         $this->assertTokenization(
 533             '<param name="src" value="http://example.com/video.wmv" />',
 534             array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
 535         );
 536     }
 537
 538     function test_tokenizeHTML_style() {
 539         $extra = array(
 540                 // PH5P doesn't seem to like style tags
 541                 'PH5P' => false,
 542                 // DirectLex defers to RemoveForeignElements for textification
 543                 'DirectLex' => array(
 544                     new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 545                     new HTMLPurifier_Token_Comment("\ndiv {}\n"),
 546                     new HTMLPurifier_Token_End('style'),
 547                 ),
 548             );
 549         if (!defined('LIBXML_VERSION')) {
 550             // LIBXML_VERSION is missing in early versions of PHP
 551             // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
 552             // this translates to 5.0.x. In such cases, punt the test entirely.
 553             return;
 554         } elseif (LIBXML_VERSION < 20628) {
 555             // libxml's behavior is wrong prior to this version, so make
 556             // appropriate accomodations
 557             $extra['DOMLex'] = $extra['DirectLex'];
 558         }
 559         $this->assertTokenization(
 560 '<style type="text/css"><!--
 561 div {}
 562 --></style>',
 563             array(
 564                 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 565                 new HTMLPurifier_Token_Text("\ndiv {}\n"),
 566                 new HTMLPurifier_Token_End('style'),
 567             ),
 568             $extra
 569         );
 570     }
 571
 572     function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
 573         $alt_expect = array(
 574             // Technically this is invalid, but it won't be a
 575             // problem with invalid element removal; also, this
 576             // mimics Mozilla's parsing of the tag.
 577             new HTMLPurifier_Token_Start('a@'),
 578             new HTMLPurifier_Token_Text('>'),
 579         );
 580         $this->assertTokenization(
 581             '<a@>>',
 582             array(
 583                 new HTMLPurifier_Token_Start('a'),
 584                 new HTMLPurifier_Token_Text('>'),
 585                 new HTMLPurifier_Token_End('a'),
 586             ),
 587             array(
 588                 'DirectLex' => $alt_expect,
 589                 'PEARSax3' => $alt_expect,
 590             )
 591         );
 592     }
 593
 594     function test_tokenizeHTML_emoticonHeart() {
 595         $this->assertTokenization(
 596             '<br /><3<br />',
 597             array(
 598                 new HTMLPurifier_Token_Empty('br'),
 599                 new HTMLPurifier_Token_Text('<'),
 600                 new HTMLPurifier_Token_Text('3'),
 601                 new HTMLPurifier_Token_Empty('br'),
 602             ),
 603             array(
 604                 'DOMLex' => array(
 605                     new HTMLPurifier_Token_Empty('br'),
 606                     new HTMLPurifier_Token_Text('<3'),
 607                     new HTMLPurifier_Token_Empty('br'),
 608                 ),
 609                 'PEARSax3' => array(
 610                     // bah too lazy to fix this
 611                     new HTMLPurifier_Token_Empty('br'),
 612                     new HTMLPurifier_Token_Empty('3<br'),
 613                 ),
 614             )
 615         );
 616     }
 617
 618     function test_tokenizeHTML_emoticonShiftyEyes() {
 619         $this->assertTokenization(
 620             '<b><<</b>',
 621             array(
 622                 new HTMLPurifier_Token_Start('b'),
 623                 new HTMLPurifier_Token_Text('<'),
 624                 new HTMLPurifier_Token_Text('<'),
 625                 new HTMLPurifier_Token_End('b'),
 626             ),
 627             array(
 628                 'DOMLex' => array(
 629                     new HTMLPurifier_Token_Start('b'),
 630                     new HTMLPurifier_Token_Text('<<'),
 631                     new HTMLPurifier_Token_End('b'),
 632                 ),
 633                 'PEARSax3' => array(
 634                     // also too lazy to fix
 635                     new HTMLPurifier_Token_Start('b'),
 636                     new HTMLPurifier_Token_Empty('<<'),
 637                     new HTMLPurifier_Token_Text('b>'),
 638                 ),
 639             )
 640         );
 641     }
 642
 643     function test_tokenizeHTML_eon1996() {
 644         $this->assertTokenization(
 645             '< <b>test</b>',
 646             array(
 647                 new HTMLPurifier_Token_Text('<'),
 648                 new HTMLPurifier_Token_Text(' '),
 649                 new HTMLPurifier_Token_Start('b'),
 650                 new HTMLPurifier_Token_Text('test'),
 651                 new HTMLPurifier_Token_End('b'),
 652             ),
 653             array(
 654                 'DOMLex' => array(
 655                     new HTMLPurifier_Token_Text('< '),
 656                     new HTMLPurifier_Token_Start('b'),
 657                     new HTMLPurifier_Token_Text('test'),
 658                     new HTMLPurifier_Token_End('b'),
 659                 ),
 660                 'PEARSax3' => array(
 661                     // totally doing the wrong thing here
 662                     new HTMLPurifier_Token_Text(' '),
 663                     new HTMLPurifier_Token_Start('b'),
 664                     new HTMLPurifier_Token_Text('test'),
 665                     new HTMLPurifier_Token_End('b'),
 666                 ),
 667             )
 668         );
 669     }
 670
 671     function test_tokenizeHTML_bodyInCDATA() {
 672         $alt_tokens = array(
 673             new HTMLPurifier_Token_Text('<'),
 674             new HTMLPurifier_Token_Text('body'),
 675             new HTMLPurifier_Token_Text('>'),
 676             new HTMLPurifier_Token_Text('Foo'),
 677             new HTMLPurifier_Token_Text('<'),
 678             new HTMLPurifier_Token_Text('/body'),
 679             new HTMLPurifier_Token_Text('>'),
 680         );
 681         $this->assertTokenization(
 682             '<![CDATA[<body>Foo</body>]]>',
 683             array(
 684                 new HTMLPurifier_Token_Text('<body>Foo</body>'),
 685             ),
 686             array(
 687                 'PH5P' => $alt_tokens,
 688                 'PEARSax3' => $alt_tokens,
 689             )
 690         );
 691     }
 692
 693     function test_tokenizeHTML_() {
 694         $this->assertTokenization(
 695             '<a><img /></a>',
 696             array(
 697                 new HTMLPurifier_Token_Start('a'),
 698                 new HTMLPurifier_Token_Empty('img'),
 699                 new HTMLPurifier_Token_End('a'),
 700             )
 701         );
 702     }
 703
 704     /*
 705
 706     function test_tokenizeHTML_() {
 707         $this->assertTokenization(
 708             ,
 709             array(
 710
 711             )
 712         );
 713     }
 714     */
 715
 716 }
 717
 718 // vim: et sw=4 sts=4