tests/HTMLPurifier/LexerTest.php

   1 <?php
   2
   3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
   4 {
   5
   6     protected $_has_pear = false;
   7
   8     public function __construct() {
   9         parent::__construct();
  10         if ($GLOBALS['HTMLPurifierTest']['PEAR'] &&
  11         // PEARSax3 is not maintained and throws loads of DEPRECATED
  12         // errors in PHP 5.3
  13         version_compare(PHP_VERSION, '5.3', '<')) {
  14             require_once 'HTMLPurifier/Lexer/PEARSax3.php';
  15             $this->_has_pear = true;
  16         }
  17         if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
  18             require_once 'HTMLPurifier/Lexer/PH5P.php';
  19         }
  20     }
  21
  22     // HTMLPurifier_Lexer::create() --------------------------------------------
  23
  24     function test_create() {
  25         $this->config->set('Core.MaintainLineNumbers', true);
  26         $lexer = HTMLPurifier_Lexer::create($this->config);
  27         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  28     }
  29
  30     function test_create_objectLexerImpl() {
  31         $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
  32         $lexer = HTMLPurifier_Lexer::create($this->config);
  33         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  34     }
  35
  36     function test_create_unknownLexer() {
  37         $this->config->set('Core.LexerImpl', 'AsdfAsdf');
  38         $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
  39         HTMLPurifier_Lexer::create($this->config);
  40     }
  41
  42     function test_create_incompatibleLexer() {
  43         $this->config->set('Core.LexerImpl', 'DOMLex');
  44         $this->config->set('Core.MaintainLineNumbers', true);
  45         $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
  46         HTMLPurifier_Lexer::create($this->config);
  47     }
  48
  49     // HTMLPurifier_Lexer->parseData() -----------------------------------------
  50
  51     function assertParseData($input, $expect = true) {
  52         if ($expect === true) $expect = $input;
  53         $lexer = new HTMLPurifier_Lexer();
  54         $this->assertIdentical($expect, $lexer->parseData($input));
  55     }
  56
  57     function test_parseData_plainText() {
  58         $this->assertParseData('asdf');
  59     }
  60
  61     function test_parseData_ampersandEntity() {
  62         $this->assertParseData('&amp;', '&');
  63     }
  64
  65     function test_parseData_quotEntity() {
  66         $this->assertParseData('&quot;', '"');
  67     }
  68
  69     function test_parseData_aposNumericEntity() {
  70         $this->assertParseData('&#039;', "'");
  71     }
  72
  73     function test_parseData_aposCompactNumericEntity() {
  74         $this->assertParseData('&#39;', "'");
  75     }
  76
  77     function test_parseData_adjacentAmpersandEntities() {
  78         $this->assertParseData('&amp;&amp;&amp;', '&&&');
  79     }
  80
  81     function test_parseData_trailingUnescapedAmpersand() {
  82         $this->assertParseData('&amp;&', '&&');
  83     }
  84
  85     function test_parseData_internalUnescapedAmpersand() {
  86         $this->assertParseData('Procter & Gamble');
  87     }
  88
  89     function test_parseData_improperEntityFaultToleranceTest() {
  90         $this->assertParseData('&#x2D;');
  91     }
  92
  93     // HTMLPurifier_Lexer->extractBody() ---------------------------------------
  94
  95     function assertExtractBody($text, $extract = true) {
  96         $lexer = new HTMLPurifier_Lexer();
  97         $result = $lexer->extractBody($text);
  98         if ($extract === true) $extract = $text;
  99         $this->assertIdentical($extract, $result);
 100     }
 101
 102     function test_extractBody_noBodyTags() {
 103         $this->assertExtractBody('<b>Bold</b>');
 104     }
 105
 106     function test_extractBody_lowercaseBodyTags() {
 107         $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
 108     }
 109
 110     function test_extractBody_uppercaseBodyTags() {
 111         $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
 112     }
 113
 114     function test_extractBody_realisticUseCase() {
 115         $this->assertExtractBody(
 116 '<?xml version="1.0"
 117 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 118     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 119 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 120    <head>
 121       <title>xyz</title>
 122    </head>
 123    <body>
 124       <form method="post" action="whatever1">
 125          <div>
 126             <input type="text" name="username" />
 127             <input type="text" name="password" />
 128             <input type="submit" />
 129          </div>
 130       </form>
 131    </body>
 132 </html>',
 133     '
 134       <form method="post" action="whatever1">
 135          <div>
 136             <input type="text" name="username" />
 137             <input type="text" name="password" />
 138             <input type="submit" />
 139          </div>
 140       </form>
 141    ');
 142     }
 143
 144     function test_extractBody_bodyWithAttributes() {
 145         $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
 146     }
 147
 148     function test_extractBody_preserveUnclosedBody() {
 149         $this->assertExtractBody('<body>asdf'); // not closed, don't accept
 150     }
 151
 152     function test_extractBody_useLastBody() {
 153         $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
 154     }
 155
 156     // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
 157
 158     function assertTokenization($input, $expect, $alt_expect = array()) {
 159         $lexers = array();
 160         $lexers['DirectLex']  = new HTMLPurifier_Lexer_DirectLex();
 161         if ($this->_has_pear) $lexers['PEARSax3']   = new HTMLPurifier_Lexer_PEARSax3();
 162         if (class_exists('DOMDocument')) {
 163             $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
 164             $lexers['PH5P']   = new HTMLPurifier_Lexer_PH5P();
 165         }
 166         foreach ($lexers as $name => $lexer) {
 167             $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
 168             if (isset($alt_expect[$name])) {
 169                 if ($alt_expect[$name] === false) continue;
 170                 $t_expect = $alt_expect[$name];
 171                 $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
 172             } else {
 173                 $t_expect = $expect;
 174                 $this->assertIdentical($result, $expect, "$name: %s");
 175             }
 176             if ($t_expect != $result) {
 177                 printTokens($result);
 178             }
 179         }
 180     }
 181
 182     function test_tokenizeHTML_emptyInput() {
 183         $this->assertTokenization('', array());
 184     }
 185
 186     function test_tokenizeHTML_plainText() {
 187         $this->assertTokenization(
 188             'This is regular text.',
 189             array(
 190                 new HTMLPurifier_Token_Text('This is regular text.')
 191             )
 192         );
 193     }
 194
 195     function test_tokenizeHTML_textAndTags() {
 196         $this->assertTokenization(
 197             'This is <b>bold</b> text',
 198             array(
 199                 new HTMLPurifier_Token_Text('This is '),
 200                 new HTMLPurifier_Token_Start('b', array()),
 201                 new HTMLPurifier_Token_Text('bold'),
 202                 new HTMLPurifier_Token_End('b'),
 203                 new HTMLPurifier_Token_Text(' text'),
 204             )
 205         );
 206     }
 207
 208     function test_tokenizeHTML_normalizeCase() {
 209         $this->assertTokenization(
 210             '<DIV>Totally rad dude. <b>asdf</b></div>',
 211             array(
 212                 new HTMLPurifier_Token_Start('DIV', array()),
 213                 new HTMLPurifier_Token_Text('Totally rad dude. '),
 214                 new HTMLPurifier_Token_Start('b', array()),
 215                 new HTMLPurifier_Token_Text('asdf'),
 216                 new HTMLPurifier_Token_End('b'),
 217                 new HTMLPurifier_Token_End('div'),
 218             )
 219         );
 220     }
 221
 222     function test_tokenizeHTML_notWellFormed() {
 223         $this->assertTokenization(
 224             '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
 225             array(
 226                 new HTMLPurifier_Token_Start('asdf'),
 227                 new HTMLPurifier_Token_End('asdf'),
 228                 new HTMLPurifier_Token_Start('d'),
 229                 new HTMLPurifier_Token_End('d'),
 230                 new HTMLPurifier_Token_Start('poOloka'),
 231                 new HTMLPurifier_Token_Start('poolasdf'),
 232                 new HTMLPurifier_Token_Start('ds'),
 233                 new HTMLPurifier_Token_End('asdf'),
 234                 new HTMLPurifier_Token_End('ASDF'),
 235             ),
 236             array(
 237                 'DOMLex' => $alt = array(
 238                     new HTMLPurifier_Token_Empty('asdf'),
 239                     new HTMLPurifier_Token_Empty('d'),
 240                     new HTMLPurifier_Token_Start('pooloka'),
 241                     new HTMLPurifier_Token_Start('poolasdf'),
 242                     new HTMLPurifier_Token_Empty('ds'),
 243                     new HTMLPurifier_Token_End('poolasdf'),
 244                     new HTMLPurifier_Token_End('pooloka'),
 245                 ),
 246                 'PH5P' => $alt,
 247             )
 248         );
 249     }
 250
 251     function test_tokenizeHTML_whitespaceInTag() {
 252         $this->assertTokenization(
 253             '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
 254             array(
 255                 new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
 256                 new HTMLPurifier_Token_Text('Link to '),
 257                 new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
 258                 new HTMLPurifier_Token_Text('foobar'),
 259                 new HTMLPurifier_Token_End('b'),
 260                 new HTMLPurifier_Token_End('a'),
 261             )
 262         );
 263     }
 264
 265     function test_tokenizeHTML_singleAttribute() {
 266         $this->assertTokenization(
 267             '<br style="&amp;" />',
 268             array(
 269                 new HTMLPurifier_Token_Empty('br', array('style' => '&'))
 270             )
 271         );
 272     }
 273
 274     function test_tokenizeHTML_emptyTag() {
 275         $this->assertTokenization(
 276             '<br />',
 277             array( new HTMLPurifier_Token_Empty('br') )
 278         );
 279     }
 280
 281     function test_tokenizeHTML_comment() {
 282         $this->assertTokenization(
 283             '<!-- Comment -->',
 284             array( new HTMLPurifier_Token_Comment(' Comment ') )
 285         );
 286     }
 287
 288     function test_tokenizeHTML_malformedComment() {
 289         $this->assertTokenization(
 290             '<!-- not so well formed --->',
 291             array( new HTMLPurifier_Token_Comment(' not so well formed -') )
 292         );
 293     }
 294
 295     function test_tokenizeHTML_unterminatedTag() {
 296         $this->assertTokenization(
 297             '<a href=""',
 298             array( new HTMLPurifier_Token_Text('<a href=""') ),
 299             array(
 300                 // I like our behavior better, but it's non-standard
 301                 'DOMLex'   => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
 302                 'PEARSax3' => array( new HTMLPurifier_Token_Start('a', array('href'=>'')) ),
 303                 'PH5P' => false, // total barfing, grabs scaffolding too
 304             )
 305         );
 306     }
 307
 308     function test_tokenizeHTML_specialEntities() {
 309         $this->assertTokenization(
 310             '&lt;b&gt;',
 311             array(
 312                 new HTMLPurifier_Token_Text('<b>')
 313             ),
 314             array(
 315                 // some parsers will separate entities out
 316                 'PEARSax3' => $split = array(
 317                     new HTMLPurifier_Token_Text('<'),
 318                     new HTMLPurifier_Token_Text('b'),
 319                     new HTMLPurifier_Token_Text('>'),
 320                 ),
 321                 'PH5P' => $split,
 322             )
 323         );
 324     }
 325
 326     function test_tokenizeHTML_earlyQuote() {
 327         $this->assertTokenization(
 328             '<a "=>',
 329             array( new HTMLPurifier_Token_Empty('a') ),
 330             array(
 331                 // we barf on this input
 332                 'DirectLex' => $tokens = array(
 333                     new HTMLPurifier_Token_Start('a', array('"' => ''))
 334                 ),
 335                 'PEARSax3' => $tokens,
 336                 'PH5P' => false, // behavior varies; handle this personally
 337             )
 338         );
 339     }
 340
 341     function test_tokenizeHTML_earlyQuote_PH5P() {
 342         if (!class_exists('DOMDocument')) return;
 343         $lexer = new HTMLPurifier_Lexer_PH5P();
 344         $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
 345         if ($this->context->get('PH5PError', true)) {
 346             $this->assertIdentical(array(
 347                 new HTMLPurifier_Token_Start('a', array('"' => ''))
 348             ), $result);
 349         } else {
 350             $this->assertIdentical(array(
 351                 new HTMLPurifier_Token_Empty('a', array('"' => ''))
 352             ), $result);
 353         }
 354     }
 355
 356     function test_tokenizeHTML_unescapedQuote() {
 357         $this->assertTokenization(
 358             '"',
 359             array( new HTMLPurifier_Token_Text('"') )
 360         );
 361     }
 362
 363     function test_tokenizeHTML_escapedQuote() {
 364         $this->assertTokenization(
 365             '&quot;',
 366             array( new HTMLPurifier_Token_Text('"') ),
 367             array(
 368                 'PEARSax3' => false, // PEAR barfs on this
 369             )
 370         );
 371     }
 372
 373     function test_tokenizeHTML_cdata() {
 374         $this->assertTokenization(
 375             '<![CDATA[You <b>can&#39;t</b> get me!]]>',
 376             array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
 377             array(
 378                 // PEAR splits up all of the CDATA
 379                 'PEARSax3' => $split = array(
 380                     new HTMLPurifier_Token_Text('You '),
 381                     new HTMLPurifier_Token_Text('<'),
 382                     new HTMLPurifier_Token_Text('b'),
 383                     new HTMLPurifier_Token_Text('>'),
 384                     new HTMLPurifier_Token_Text('can'),
 385                     new HTMLPurifier_Token_Text('&'),
 386                     new HTMLPurifier_Token_Text('#39;t'),
 387                     new HTMLPurifier_Token_Text('<'),
 388                     new HTMLPurifier_Token_Text('/b'),
 389                     new HTMLPurifier_Token_Text('>'),
 390                     new HTMLPurifier_Token_Text(' get me!'),
 391                 ),
 392                 'PH5P' => $split,
 393             )
 394         );
 395     }
 396
 397     function test_tokenizeHTML_characterEntity() {
 398         $this->assertTokenization(
 399             '&theta;',
 400             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 401         );
 402     }
 403
 404     function test_tokenizeHTML_characterEntityInCDATA() {
 405         $this->assertTokenization(
 406             '<![CDATA[&rarr;]]>',
 407             array( new HTMLPurifier_Token_Text("&rarr;") ),
 408             array(
 409                 'PEARSax3' => $split = array(
 410                     new HTMLPurifier_Token_Text('&'),
 411                     new HTMLPurifier_Token_Text('rarr;'),
 412                 ),
 413                 'PH5P' => $split,
 414             )
 415         );
 416     }
 417
 418     function test_tokenizeHTML_entityInAttribute() {
 419         $this->assertTokenization(
 420             '<a href="index.php?title=foo&amp;id=bar">Link</a>',
 421             array(
 422                 new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
 423                 new HTMLPurifier_Token_Text('Link'),
 424                 new HTMLPurifier_Token_End('a'),
 425             )
 426         );
 427     }
 428
 429     function test_tokenizeHTML_preserveUTF8() {
 430         $this->assertTokenization(
 431             "\xCE\xB8",
 432             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 433         );
 434     }
 435
 436     function test_tokenizeHTML_specialEntityInAttribute() {
 437         $this->assertTokenization(
 438             '<br test="x &lt; 6" />',
 439             array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
 440         );
 441     }
 442
 443     function test_tokenizeHTML_emoticonProtection() {
 444         $this->assertTokenization(
 445             '<b>Whoa! <3 That\'s not good >.></b>',
 446             array(
 447                 new HTMLPurifier_Token_Start('b'),
 448                 new HTMLPurifier_Token_Text('Whoa! '),
 449                 new HTMLPurifier_Token_Text('<'),
 450                 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 451                 new HTMLPurifier_Token_End('b')
 452             ),
 453             array(
 454                 // text is absorbed together
 455                 'DOMLex' => array(
 456                     new HTMLPurifier_Token_Start('b'),
 457                     new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
 458                     new HTMLPurifier_Token_End('b'),
 459                 ),
 460                 'PEARSax3' => false, // totally mangled
 461                 'PH5P' => array( // interesting grouping
 462                     new HTMLPurifier_Token_Start('b'),
 463                     new HTMLPurifier_Token_Text('Whoa! '),
 464                     new HTMLPurifier_Token_Text('<'),
 465                     new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 466                     new HTMLPurifier_Token_End('b'),
 467                 ),
 468             )
 469         );
 470     }
 471
 472     function test_tokenizeHTML_commentWithFunkyChars() {
 473         $this->assertTokenization(
 474             '<!-- This >< comment --><br />',
 475             array(
 476                 new HTMLPurifier_Token_Comment(' This >< comment '),
 477                 new HTMLPurifier_Token_Empty('br'),
 478             ),
 479             array(
 480                 'PEARSax3' => false,
 481             )
 482         );
 483     }
 484
 485     function test_tokenizeHTML_unterminatedComment() {
 486         $this->assertTokenization(
 487             '<!-- This >< comment',
 488             array( new HTMLPurifier_Token_Comment(' This >< comment') ),
 489             array(
 490                 'DOMLex'   => false,
 491                 'PEARSax3' => false,
 492                 'PH5P'     => false,
 493             )
 494         );
 495     }
 496
 497     function test_tokenizeHTML_scriptCDATAContents() {
 498         $this->config->set('HTML.Trusted', true);
 499         $this->assertTokenization(
 500             'Foo: <script>alert("<foo>");</script>',
 501             array(
 502                 new HTMLPurifier_Token_Text('Foo: '),
 503                 new HTMLPurifier_Token_Start('script'),
 504                 new HTMLPurifier_Token_Text('alert("<foo>");'),
 505                 new HTMLPurifier_Token_End('script'),
 506             ),
 507             array(
 508                 'PEARSax3' => false,
 509                 // PH5P, for some reason, bubbles the script to <head>
 510                 'PH5P' => false,
 511             )
 512         );
 513     }
 514
 515     function test_tokenizeHTML_entitiesInComment() {
 516         $this->assertTokenization(
 517             '<!-- This comment < &lt; & -->',
 518             array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') ),
 519             array(
 520                 'PEARSax3' => false
 521             )
 522         );
 523     }
 524
 525     function test_tokenizeHTML_attributeWithSpecialCharacters() {
 526         $this->assertTokenization(
 527             '<a href="><>">',
 528             array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
 529             array(
 530                 'DirectLex' => array(
 531                     new HTMLPurifier_Token_Start('a', array('href' => '')),
 532                     new HTMLPurifier_Token_Text('<'),
 533                     new HTMLPurifier_Token_Text('">'),
 534                 ),
 535                 'PEARSax3' => false,
 536             )
 537         );
 538     }
 539
 540     function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
 541         $this->assertTokenization(
 542             '<param name="src" value="http://example.com/video.wmv" />',
 543             array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
 544         );
 545     }
 546
 547     function test_tokenizeHTML_style() {
 548         $extra = array(
 549                 // PH5P doesn't seem to like style tags
 550                 'PH5P' => false,
 551                 // DirectLex defers to RemoveForeignElements for textification
 552                 'DirectLex' => array(
 553                     new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 554                     new HTMLPurifier_Token_Comment("\ndiv {}\n"),
 555                     new HTMLPurifier_Token_End('style'),
 556                 ),
 557             );
 558         if (!defined('LIBXML_VERSION')) {
 559             // LIBXML_VERSION is missing in early versions of PHP
 560             // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
 561             // this translates to 5.0.x. In such cases, punt the test entirely.
 562             return;
 563         } elseif (LIBXML_VERSION < 20628) {
 564             // libxml's behavior is wrong prior to this version, so make
 565             // appropriate accomodations
 566             $extra['DOMLex'] = $extra['DirectLex'];
 567         }
 568         $this->assertTokenization(
 569 '<style type="text/css"><!--
 570 div {}
 571 --></style>',
 572             array(
 573                 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 574                 new HTMLPurifier_Token_Text("\ndiv {}\n"),
 575                 new HTMLPurifier_Token_End('style'),
 576             ),
 577             $extra
 578         );
 579     }
 580
 581     function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
 582         $alt_expect = array(
 583             // Technically this is invalid, but it won't be a
 584             // problem with invalid element removal; also, this
 585             // mimics Mozilla's parsing of the tag.
 586             new HTMLPurifier_Token_Start('a@'),
 587             new HTMLPurifier_Token_Text('>'),
 588         );
 589         $this->assertTokenization(
 590             '<a@>>',
 591             array(
 592                 new HTMLPurifier_Token_Start('a'),
 593                 new HTMLPurifier_Token_Text('>'),
 594                 new HTMLPurifier_Token_End('a'),
 595             ),
 596             array(
 597                 'DirectLex' => $alt_expect,
 598                 'PEARSax3' => $alt_expect,
 599             )
 600         );
 601     }
 602
 603     function test_tokenizeHTML_emoticonHeart() {
 604         $this->assertTokenization(
 605             '<br /><3<br />',
 606             array(
 607                 new HTMLPurifier_Token_Empty('br'),
 608                 new HTMLPurifier_Token_Text('<'),
 609                 new HTMLPurifier_Token_Text('3'),
 610                 new HTMLPurifier_Token_Empty('br'),
 611             ),
 612             array(
 613                 'DOMLex' => array(
 614                     new HTMLPurifier_Token_Empty('br'),
 615                     new HTMLPurifier_Token_Text('<3'),
 616                     new HTMLPurifier_Token_Empty('br'),
 617                 ),
 618                 'PEARSax3' => array(
 619                     // bah too lazy to fix this
 620                     new HTMLPurifier_Token_Empty('br'),
 621                     new HTMLPurifier_Token_Empty('3<br'),
 622                 ),
 623             )
 624         );
 625     }
 626
 627     function test_tokenizeHTML_emoticonShiftyEyes() {
 628         $this->assertTokenization(
 629             '<b><<</b>',
 630             array(
 631                 new HTMLPurifier_Token_Start('b'),
 632                 new HTMLPurifier_Token_Text('<'),
 633                 new HTMLPurifier_Token_Text('<'),
 634                 new HTMLPurifier_Token_End('b'),
 635             ),
 636             array(
 637                 'DOMLex' => array(
 638                     new HTMLPurifier_Token_Start('b'),
 639                     new HTMLPurifier_Token_Text('<<'),
 640                     new HTMLPurifier_Token_End('b'),
 641                 ),
 642                 'PEARSax3' => array(
 643                     // also too lazy to fix
 644                     new HTMLPurifier_Token_Start('b'),
 645                     new HTMLPurifier_Token_Empty('<<'),
 646                     new HTMLPurifier_Token_Text('b>'),
 647                 ),
 648             )
 649         );
 650     }
 651
 652     function test_tokenizeHTML_eon1996() {
 653         $this->assertTokenization(
 654             '< <b>test</b>',
 655             array(
 656                 new HTMLPurifier_Token_Text('<'),
 657                 new HTMLPurifier_Token_Text(' '),
 658                 new HTMLPurifier_Token_Start('b'),
 659                 new HTMLPurifier_Token_Text('test'),
 660                 new HTMLPurifier_Token_End('b'),
 661             ),
 662             array(
 663                 'DOMLex' => array(
 664                     new HTMLPurifier_Token_Text('< '),
 665                     new HTMLPurifier_Token_Start('b'),
 666                     new HTMLPurifier_Token_Text('test'),
 667                     new HTMLPurifier_Token_End('b'),
 668                 ),
 669                 'PEARSax3' => array(
 670                     // totally doing the wrong thing here
 671                     new HTMLPurifier_Token_Text(' '),
 672                     new HTMLPurifier_Token_Start('b'),
 673                     new HTMLPurifier_Token_Text('test'),
 674                     new HTMLPurifier_Token_End('b'),
 675                 ),
 676             )
 677         );
 678     }
 679
 680     function test_tokenizeHTML_bodyInCDATA() {
 681         $alt_tokens = array(
 682             new HTMLPurifier_Token_Text('<'),
 683             new HTMLPurifier_Token_Text('body'),
 684             new HTMLPurifier_Token_Text('>'),
 685             new HTMLPurifier_Token_Text('Foo'),
 686             new HTMLPurifier_Token_Text('<'),
 687             new HTMLPurifier_Token_Text('/body'),
 688             new HTMLPurifier_Token_Text('>'),
 689         );
 690         $this->assertTokenization(
 691             '<![CDATA[<body>Foo</body>]]>',
 692             array(
 693                 new HTMLPurifier_Token_Text('<body>Foo</body>'),
 694             ),
 695             array(
 696                 'PH5P' => $alt_tokens,
 697                 'PEARSax3' => $alt_tokens,
 698             )
 699         );
 700     }
 701
 702     function test_tokenizeHTML_() {
 703         $this->assertTokenization(
 704             '<a><img /></a>',
 705             array(
 706                 new HTMLPurifier_Token_Start('a'),
 707                 new HTMLPurifier_Token_Empty('img'),
 708                 new HTMLPurifier_Token_End('a'),
 709             )
 710         );
 711     }
 712
 713     function test_tokenizeHTML_ignoreIECondComment() {
 714         $this->assertTokenization(
 715             '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->',
 716             array()
 717         );
 718     }
 719
 720     function test_tokenizeHTML_removeProcessingInstruction() {
 721         $this->config->set('Core.RemoveProcessingInstructions', true);
 722         $this->assertTokenization(
 723             '<?xml blah blah ?>',
 724             array()
 725         );
 726     }
 727
 728    function test_tokenizeHTML_removeNewline() {
 729         $this->config->set('Core.NormalizeNewlines', true);
 730         $this->assertTokenization(
 731             "plain\rtext\r\n",
 732             array(
 733                 new HTMLPurifier_Token_Text("plain\ntext\n")
 734             )
 735         );
 736    }
 737
 738    function test_tokenizeHTML_noRemoveNewline() {
 739         $this->config->set('Core.NormalizeNewlines', false);
 740         $this->assertTokenization(
 741             "plain\rtext\r\n",
 742             array(
 743                 new HTMLPurifier_Token_Text("plain\rtext\r\n")
 744             )
 745         );
 746      }
 747
 748     function test_tokenizeHTML_conditionalCommentUngreedy() {
 749         $this->assertTokenization(
 750             '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->',
 751             array(
 752                 new HTMLPurifier_Token_Text("b")
 753             )
 754         );
 755     }
 756
 757     function test_tokenizeHTML_imgTag() {
 758         $start = array(
 759                         new HTMLPurifier_Token_Start('img',
 760                             array(
 761                                 'src' => 'img_11775.jpg',
 762                                 'alt' => '[Img #11775]',
 763                                 'id' => 'EMBEDDED_IMG_11775',
 764                             )
 765                         )
 766                     );
 767         $this->assertTokenization(
 768             '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >',
 769             array(
 770                 new HTMLPurifier_Token_Empty('img',
 771                     array(
 772                         'src' => 'img_11775.jpg',
 773                         'alt' => '[Img #11775]',
 774                         'id' => 'EMBEDDED_IMG_11775',
 775                     )
 776                 )
 777             ),
 778             array(
 779                 'DirectLex' => $start,
 780                 'PEARSax3' => $start,
 781                 )
 782         );
 783     }
 784
 785
 786     /*
 787
 788     function test_tokenizeHTML_() {
 789         $this->assertTokenization(
 790             ,
 791             array(
 792
 793             )
 794         );
 795     }
 796     */
 797
 798 }
 799
 800 // vim: et sw=4 sts=4