tests/HTMLPurifier/LexerTest.php

   1 <?php
   2
   3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
   4 {
   5
   6     protected $_has_pear = false;
   7
   8     public function __construct() {
   9         parent::__construct();
  10         if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
  11             require_once 'HTMLPurifier/Lexer/PH5P.php';
  12         }
  13     }
  14
  15     // HTMLPurifier_Lexer::create() --------------------------------------------
  16
  17     function test_create() {
  18         $this->config->set('Core.MaintainLineNumbers', true);
  19         $lexer = HTMLPurifier_Lexer::create($this->config);
  20         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  21     }
  22
  23     function test_create_objectLexerImpl() {
  24         $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
  25         $lexer = HTMLPurifier_Lexer::create($this->config);
  26         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  27     }
  28
  29     function test_create_unknownLexer() {
  30         $this->config->set('Core.LexerImpl', 'AsdfAsdf');
  31         $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
  32         HTMLPurifier_Lexer::create($this->config);
  33     }
  34
  35     function test_create_incompatibleLexer() {
  36         $this->config->set('Core.LexerImpl', 'DOMLex');
  37         $this->config->set('Core.MaintainLineNumbers', true);
  38         $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
  39         HTMLPurifier_Lexer::create($this->config);
  40     }
  41
  42     // HTMLPurifier_Lexer->parseData() -----------------------------------------
  43
  44     function assertParseData($input, $expect = true) {
  45         if ($expect === true) $expect = $input;
  46         $lexer = new HTMLPurifier_Lexer();
  47         $this->assertIdentical($expect, $lexer->parseData($input));
  48     }
  49
  50     function test_parseData_plainText() {
  51         $this->assertParseData('asdf');
  52     }
  53
  54     function test_parseData_ampersandEntity() {
  55         $this->assertParseData('&amp;', '&');
  56     }
  57
  58     function test_parseData_quotEntity() {
  59         $this->assertParseData('&quot;', '"');
  60     }
  61
  62     function test_parseData_aposNumericEntity() {
  63         $this->assertParseData('&#039;', "'");
  64     }
  65
  66     function test_parseData_aposCompactNumericEntity() {
  67         $this->assertParseData('&#39;', "'");
  68     }
  69
  70     function test_parseData_adjacentAmpersandEntities() {
  71         $this->assertParseData('&amp;&amp;&amp;', '&&&');
  72     }
  73
  74     function test_parseData_trailingUnescapedAmpersand() {
  75         $this->assertParseData('&amp;&', '&&');
  76     }
  77
  78     function test_parseData_internalUnescapedAmpersand() {
  79         $this->assertParseData('Procter & Gamble');
  80     }
  81
  82     function test_parseData_improperEntityFaultToleranceTest() {
  83         $this->assertParseData('&#x2D;');
  84     }
  85
  86     // HTMLPurifier_Lexer->extractBody() ---------------------------------------
  87
  88     function assertExtractBody($text, $extract = true) {
  89         $lexer = new HTMLPurifier_Lexer();
  90         $result = $lexer->extractBody($text);
  91         if ($extract === true) $extract = $text;
  92         $this->assertIdentical($extract, $result);
  93     }
  94
  95     function test_extractBody_noBodyTags() {
  96         $this->assertExtractBody('<b>Bold</b>');
  97     }
  98
  99     function test_extractBody_lowercaseBodyTags() {
 100         $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
 101     }
 102
 103     function test_extractBody_uppercaseBodyTags() {
 104         $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
 105     }
 106
 107     function test_extractBody_realisticUseCase() {
 108         $this->assertExtractBody(
 109 '<?xml version="1.0"
 110 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 111     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 112 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 113    <head>
 114       <title>xyz</title>
 115    </head>
 116    <body>
 117       <form method="post" action="whatever1">
 118          <div>
 119             <input type="text" name="username" />
 120             <input type="text" name="password" />
 121             <input type="submit" />
 122          </div>
 123       </form>
 124    </body>
 125 </html>',
 126     '
 127       <form method="post" action="whatever1">
 128          <div>
 129             <input type="text" name="username" />
 130             <input type="text" name="password" />
 131             <input type="submit" />
 132          </div>
 133       </form>
 134    ');
 135     }
 136
 137     function test_extractBody_bodyWithAttributes() {
 138         $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
 139     }
 140
 141     function test_extractBody_preserveUnclosedBody() {
 142         $this->assertExtractBody('<body>asdf'); // not closed, don't accept
 143     }
 144
 145     function test_extractBody_useLastBody() {
 146         $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
 147     }
 148
 149     // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
 150
 151     function assertTokenization($input, $expect, $alt_expect = array()) {
 152         $lexers = array();
 153         $lexers['DirectLex']  = new HTMLPurifier_Lexer_DirectLex();
 154         if (class_exists('DOMDocument')) {
 155             $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
 156             $lexers['PH5P']   = new HTMLPurifier_Lexer_PH5P();
 157         }
 158         foreach ($lexers as $name => $lexer) {
 159             $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
 160             if (isset($alt_expect[$name])) {
 161                 if ($alt_expect[$name] === false) continue;
 162                 $t_expect = $alt_expect[$name];
 163                 $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
 164             } else {
 165                 $t_expect = $expect;
 166                 $this->assertIdentical($result, $expect, "$name: %s");
 167             }
 168             if ($t_expect != $result) {
 169                 printTokens($result);
 170             }
 171         }
 172     }
 173
 174     function test_tokenizeHTML_emptyInput() {
 175         $this->assertTokenization('', array());
 176     }
 177
 178     function test_tokenizeHTML_plainText() {
 179         $this->assertTokenization(
 180             'This is regular text.',
 181             array(
 182                 new HTMLPurifier_Token_Text('This is regular text.')
 183             )
 184         );
 185     }
 186
 187     function test_tokenizeHTML_textAndTags() {
 188         $this->assertTokenization(
 189             'This is <b>bold</b> text',
 190             array(
 191                 new HTMLPurifier_Token_Text('This is '),
 192                 new HTMLPurifier_Token_Start('b', array()),
 193                 new HTMLPurifier_Token_Text('bold'),
 194                 new HTMLPurifier_Token_End('b'),
 195                 new HTMLPurifier_Token_Text(' text'),
 196             )
 197         );
 198     }
 199
 200     function test_tokenizeHTML_normalizeCase() {
 201         $this->assertTokenization(
 202             '<DIV>Totally rad dude. <b>asdf</b></div>',
 203             array(
 204                 new HTMLPurifier_Token_Start('DIV', array()),
 205                 new HTMLPurifier_Token_Text('Totally rad dude. '),
 206                 new HTMLPurifier_Token_Start('b', array()),
 207                 new HTMLPurifier_Token_Text('asdf'),
 208                 new HTMLPurifier_Token_End('b'),
 209                 new HTMLPurifier_Token_End('div'),
 210             )
 211         );
 212     }
 213
 214     function test_tokenizeHTML_notWellFormed() {
 215         $this->assertTokenization(
 216             '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
 217             array(
 218                 new HTMLPurifier_Token_Start('asdf'),
 219                 new HTMLPurifier_Token_End('asdf'),
 220                 new HTMLPurifier_Token_Start('d'),
 221                 new HTMLPurifier_Token_End('d'),
 222                 new HTMLPurifier_Token_Start('poOloka'),
 223                 new HTMLPurifier_Token_Start('poolasdf'),
 224                 new HTMLPurifier_Token_Start('ds'),
 225                 new HTMLPurifier_Token_End('asdf'),
 226                 new HTMLPurifier_Token_End('ASDF'),
 227             ),
 228             array(
 229                 'DOMLex' => $alt = array(
 230                     new HTMLPurifier_Token_Empty('asdf'),
 231                     new HTMLPurifier_Token_Empty('d'),
 232                     new HTMLPurifier_Token_Start('pooloka'),
 233                     new HTMLPurifier_Token_Start('poolasdf'),
 234                     new HTMLPurifier_Token_Empty('ds'),
 235                     new HTMLPurifier_Token_End('poolasdf'),
 236                     new HTMLPurifier_Token_End('pooloka'),
 237                 ),
 238                 'PH5P' => $alt,
 239             )
 240         );
 241     }
 242
 243     function test_tokenizeHTML_whitespaceInTag() {
 244         $this->assertTokenization(
 245             '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
 246             array(
 247                 new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
 248                 new HTMLPurifier_Token_Text('Link to '),
 249                 new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
 250                 new HTMLPurifier_Token_Text('foobar'),
 251                 new HTMLPurifier_Token_End('b'),
 252                 new HTMLPurifier_Token_End('a'),
 253             )
 254         );
 255     }
 256
 257     function test_tokenizeHTML_singleAttribute() {
 258         $this->assertTokenization(
 259             '<br style="&amp;" />',
 260             array(
 261                 new HTMLPurifier_Token_Empty('br', array('style' => '&'))
 262             )
 263         );
 264     }
 265
 266     function test_tokenizeHTML_emptyTag() {
 267         $this->assertTokenization(
 268             '<br />',
 269             array( new HTMLPurifier_Token_Empty('br') )
 270         );
 271     }
 272
 273     function test_tokenizeHTML_comment() {
 274         $this->assertTokenization(
 275             '<!-- Comment -->',
 276             array( new HTMLPurifier_Token_Comment(' Comment ') )
 277         );
 278     }
 279
 280     function test_tokenizeHTML_malformedComment() {
 281         $this->assertTokenization(
 282             '<!-- not so well formed --->',
 283             array( new HTMLPurifier_Token_Comment(' not so well formed -') )
 284         );
 285     }
 286
 287     function test_tokenizeHTML_unterminatedTag() {
 288         $this->assertTokenization(
 289             '<a href=""',
 290             array( new HTMLPurifier_Token_Text('<a href=""') ),
 291             array(
 292                 // I like our behavior better, but it's non-standard
 293                 'DOMLex'   => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
 294                 'PH5P' => false, // total barfing, grabs scaffolding too
 295             )
 296         );
 297     }
 298
 299     function test_tokenizeHTML_specialEntities() {
 300         $this->assertTokenization(
 301             '&lt;b&gt;',
 302             array(
 303                 new HTMLPurifier_Token_Text('<b>')
 304             ),
 305             array(
 306                 // some parsers will separate entities out
 307                 'PH5P' => array(
 308                     new HTMLPurifier_Token_Text('<'),
 309                     new HTMLPurifier_Token_Text('b'),
 310                     new HTMLPurifier_Token_Text('>'),
 311                 ),
 312             )
 313         );
 314     }
 315
 316     function test_tokenizeHTML_earlyQuote() {
 317         $this->assertTokenization(
 318             '<a "=>',
 319             array( new HTMLPurifier_Token_Empty('a') ),
 320             array(
 321                 // we barf on this input
 322                 'DirectLex' => array(
 323                     new HTMLPurifier_Token_Start('a', array('"' => ''))
 324                 ),
 325                 'PH5P' => false, // behavior varies; handle this personally
 326             )
 327         );
 328     }
 329
 330     function test_tokenizeHTML_earlyQuote_PH5P() {
 331         if (!class_exists('DOMDocument')) return;
 332         $lexer = new HTMLPurifier_Lexer_PH5P();
 333         $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
 334         if ($this->context->get('PH5PError', true)) {
 335             $this->assertIdentical(array(
 336                 new HTMLPurifier_Token_Start('a', array('"' => ''))
 337             ), $result);
 338         } else {
 339             $this->assertIdentical(array(
 340                 new HTMLPurifier_Token_Empty('a', array('"' => ''))
 341             ), $result);
 342         }
 343     }
 344
 345     function test_tokenizeHTML_unescapedQuote() {
 346         $this->assertTokenization(
 347             '"',
 348             array( new HTMLPurifier_Token_Text('"') )
 349         );
 350     }
 351
 352     function test_tokenizeHTML_escapedQuote() {
 353         $this->assertTokenization(
 354             '&quot;',
 355             array( new HTMLPurifier_Token_Text('"') )
 356         );
 357     }
 358
 359     function test_tokenizeHTML_cdata() {
 360         $this->assertTokenization(
 361             '<![CDATA[You <b>can&#39;t</b> get me!]]>',
 362             array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
 363             array(
 364                 'PH5P' =>  array(
 365                     new HTMLPurifier_Token_Text('You '),
 366                     new HTMLPurifier_Token_Text('<'),
 367                     new HTMLPurifier_Token_Text('b'),
 368                     new HTMLPurifier_Token_Text('>'),
 369                     new HTMLPurifier_Token_Text('can'),
 370                     new HTMLPurifier_Token_Text('&'),
 371                     new HTMLPurifier_Token_Text('#39;t'),
 372                     new HTMLPurifier_Token_Text('<'),
 373                     new HTMLPurifier_Token_Text('/b'),
 374                     new HTMLPurifier_Token_Text('>'),
 375                     new HTMLPurifier_Token_Text(' get me!'),
 376                 ),
 377             )
 378         );
 379     }
 380
 381     function test_tokenizeHTML_characterEntity() {
 382         $this->assertTokenization(
 383             '&theta;',
 384             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 385         );
 386     }
 387
 388     function test_tokenizeHTML_characterEntityInCDATA() {
 389         $this->assertTokenization(
 390             '<![CDATA[&rarr;]]>',
 391             array( new HTMLPurifier_Token_Text("&rarr;") ),
 392             array(
 393                 'PH5P' => array(
 394                     new HTMLPurifier_Token_Text('&'),
 395                     new HTMLPurifier_Token_Text('rarr;'),
 396                 ),
 397             )
 398         );
 399     }
 400
 401     function test_tokenizeHTML_entityInAttribute() {
 402         $this->assertTokenization(
 403             '<a href="index.php?title=foo&amp;id=bar">Link</a>',
 404             array(
 405                 new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
 406                 new HTMLPurifier_Token_Text('Link'),
 407                 new HTMLPurifier_Token_End('a'),
 408             )
 409         );
 410     }
 411
 412     function test_tokenizeHTML_preserveUTF8() {
 413         $this->assertTokenization(
 414             "\xCE\xB8",
 415             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 416         );
 417     }
 418
 419     function test_tokenizeHTML_specialEntityInAttribute() {
 420         $this->assertTokenization(
 421             '<br test="x &lt; 6" />',
 422             array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
 423         );
 424     }
 425
 426     function test_tokenizeHTML_emoticonProtection() {
 427         $this->assertTokenization(
 428             '<b>Whoa! <3 That\'s not good >.></b>',
 429             array(
 430                 new HTMLPurifier_Token_Start('b'),
 431                 new HTMLPurifier_Token_Text('Whoa! '),
 432                 new HTMLPurifier_Token_Text('<'),
 433                 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 434                 new HTMLPurifier_Token_End('b')
 435             ),
 436             array(
 437                 // text is absorbed together
 438                 'DOMLex' => array(
 439                     new HTMLPurifier_Token_Start('b'),
 440                     new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
 441                     new HTMLPurifier_Token_End('b'),
 442                 ),
 443                 'PH5P' => array( // interesting grouping
 444                     new HTMLPurifier_Token_Start('b'),
 445                     new HTMLPurifier_Token_Text('Whoa! '),
 446                     new HTMLPurifier_Token_Text('<'),
 447                     new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 448                     new HTMLPurifier_Token_End('b'),
 449                 ),
 450             )
 451         );
 452     }
 453
 454     function test_tokenizeHTML_commentWithFunkyChars() {
 455         $this->assertTokenization(
 456             '<!-- This >< comment --><br />',
 457             array(
 458                 new HTMLPurifier_Token_Comment(' This >< comment '),
 459                 new HTMLPurifier_Token_Empty('br'),
 460             )
 461         );
 462     }
 463
 464     function test_tokenizeHTML_unterminatedComment() {
 465         $this->assertTokenization(
 466             '<!-- This >< comment',
 467             array( new HTMLPurifier_Token_Comment(' This >< comment') ),
 468             array(
 469                 'DOMLex'   => false,
 470                 'PH5P'     => false,
 471             )
 472         );
 473     }
 474
 475     function test_tokenizeHTML_scriptCDATAContents() {
 476         $this->config->set('HTML.Trusted', true);
 477         $this->assertTokenization(
 478             'Foo: <script>alert("<foo>");</script>',
 479             array(
 480                 new HTMLPurifier_Token_Text('Foo: '),
 481                 new HTMLPurifier_Token_Start('script'),
 482                 new HTMLPurifier_Token_Text('alert("<foo>");'),
 483                 new HTMLPurifier_Token_End('script'),
 484             ),
 485             array(
 486                 // PH5P, for some reason, bubbles the script to <head>
 487                 'PH5P' => false,
 488             )
 489         );
 490     }
 491
 492     function test_tokenizeHTML_entitiesInComment() {
 493         $this->assertTokenization(
 494             '<!-- This comment < &lt; & -->',
 495             array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') )
 496         );
 497     }
 498
 499     function test_tokenizeHTML_attributeWithSpecialCharacters() {
 500         $this->assertTokenization(
 501             '<a href="><>">',
 502             array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
 503             array(
 504                 'DirectLex' => array(
 505                     new HTMLPurifier_Token_Start('a', array('href' => '')),
 506                     new HTMLPurifier_Token_Text('<'),
 507                     new HTMLPurifier_Token_Text('">'),
 508                 )
 509             )
 510         );
 511     }
 512
 513     function test_tokenizeHTML_emptyTagWithSlashInAttribute() {
 514         $this->assertTokenization(
 515             '<param name="src" value="http://example.com/video.wmv" />',
 516             array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
 517         );
 518     }
 519
 520     function test_tokenizeHTML_style() {
 521         $extra = array(
 522                 // PH5P doesn't seem to like style tags
 523                 'PH5P' => false,
 524                 // DirectLex defers to RemoveForeignElements for textification
 525                 'DirectLex' => array(
 526                     new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 527                     new HTMLPurifier_Token_Comment("\ndiv {}\n"),
 528                     new HTMLPurifier_Token_End('style'),
 529                 ),
 530             );
 531         if (!defined('LIBXML_VERSION')) {
 532             // LIBXML_VERSION is missing in early versions of PHP
 533             // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
 534             // this translates to 5.0.x. In such cases, punt the test entirely.
 535             return;
 536         } elseif (LIBXML_VERSION < 20628) {
 537             // libxml's behavior is wrong prior to this version, so make
 538             // appropriate accomodations
 539             $extra['DOMLex'] = $extra['DirectLex'];
 540         }
 541         $this->assertTokenization(
 542 '<style type="text/css"><!--
 543 div {}
 544 --></style>',
 545             array(
 546                 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 547                 new HTMLPurifier_Token_Text("\ndiv {}\n"),
 548                 new HTMLPurifier_Token_End('style'),
 549             ),
 550             $extra
 551         );
 552     }
 553
 554     function test_tokenizeHTML_tagWithAtSignAndExtraGt() {
 555         $alt_expect = array(
 556             // Technically this is invalid, but it won't be a
 557             // problem with invalid element removal; also, this
 558             // mimics Mozilla's parsing of the tag.
 559             new HTMLPurifier_Token_Start('a@'),
 560             new HTMLPurifier_Token_Text('>'),
 561         );
 562         $this->assertTokenization(
 563             '<a@>>',
 564             array(
 565                 new HTMLPurifier_Token_Start('a'),
 566                 new HTMLPurifier_Token_Text('>'),
 567                 new HTMLPurifier_Token_End('a'),
 568             ),
 569             array(
 570                 'DirectLex' => $alt_expect,
 571             )
 572         );
 573     }
 574
 575     function test_tokenizeHTML_emoticonHeart() {
 576         $this->assertTokenization(
 577             '<br /><3<br />',
 578             array(
 579                 new HTMLPurifier_Token_Empty('br'),
 580                 new HTMLPurifier_Token_Text('<'),
 581                 new HTMLPurifier_Token_Text('3'),
 582                 new HTMLPurifier_Token_Empty('br'),
 583             ),
 584             array(
 585                 'DOMLex' => array(
 586                     new HTMLPurifier_Token_Empty('br'),
 587                     new HTMLPurifier_Token_Text('<3'),
 588                     new HTMLPurifier_Token_Empty('br'),
 589                 ),
 590             )
 591         );
 592     }
 593
 594     function test_tokenizeHTML_emoticonShiftyEyes() {
 595         $this->assertTokenization(
 596             '<b><<</b>',
 597             array(
 598                 new HTMLPurifier_Token_Start('b'),
 599                 new HTMLPurifier_Token_Text('<'),
 600                 new HTMLPurifier_Token_Text('<'),
 601                 new HTMLPurifier_Token_End('b'),
 602             ),
 603             array(
 604                 'DOMLex' => array(
 605                     new HTMLPurifier_Token_Start('b'),
 606                     new HTMLPurifier_Token_Text('<<'),
 607                     new HTMLPurifier_Token_End('b'),
 608                 ),
 609             )
 610         );
 611     }
 612
 613     function test_tokenizeHTML_eon1996() {
 614         $this->assertTokenization(
 615             '< <b>test</b>',
 616             array(
 617                 new HTMLPurifier_Token_Text('<'),
 618                 new HTMLPurifier_Token_Text(' '),
 619                 new HTMLPurifier_Token_Start('b'),
 620                 new HTMLPurifier_Token_Text('test'),
 621                 new HTMLPurifier_Token_End('b'),
 622             ),
 623             array(
 624                 'DOMLex' => array(
 625                     new HTMLPurifier_Token_Text('< '),
 626                     new HTMLPurifier_Token_Start('b'),
 627                     new HTMLPurifier_Token_Text('test'),
 628                     new HTMLPurifier_Token_End('b'),
 629                 ),
 630             )
 631         );
 632     }
 633
 634     function test_tokenizeHTML_bodyInCDATA() {
 635         $alt_tokens = array(
 636             new HTMLPurifier_Token_Text('<'),
 637             new HTMLPurifier_Token_Text('body'),
 638             new HTMLPurifier_Token_Text('>'),
 639             new HTMLPurifier_Token_Text('Foo'),
 640             new HTMLPurifier_Token_Text('<'),
 641             new HTMLPurifier_Token_Text('/body'),
 642             new HTMLPurifier_Token_Text('>'),
 643         );
 644         $this->assertTokenization(
 645             '<![CDATA[<body>Foo</body>]]>',
 646             array(
 647                 new HTMLPurifier_Token_Text('<body>Foo</body>'),
 648             ),
 649             array(
 650                 'PH5P' => $alt_tokens,
 651             )
 652         );
 653     }
 654
 655     function test_tokenizeHTML_() {
 656         $this->assertTokenization(
 657             '<a><img /></a>',
 658             array(
 659                 new HTMLPurifier_Token_Start('a'),
 660                 new HTMLPurifier_Token_Empty('img'),
 661                 new HTMLPurifier_Token_End('a'),
 662             )
 663         );
 664     }
 665
 666     function test_tokenizeHTML_ignoreIECondComment() {
 667         $this->assertTokenization(
 668             '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->',
 669             array()
 670         );
 671     }
 672
 673     function test_tokenizeHTML_removeProcessingInstruction() {
 674         $this->config->set('Core.RemoveProcessingInstructions', true);
 675         $this->assertTokenization(
 676             '<?xml blah blah ?>',
 677             array()
 678         );
 679     }
 680
 681    function test_tokenizeHTML_removeNewline() {
 682         $this->config->set('Core.NormalizeNewlines', true);
 683         $this->assertTokenization(
 684             "plain\rtext\r\n",
 685             array(
 686                 new HTMLPurifier_Token_Text("plain\ntext\n")
 687             )
 688         );
 689    }
 690
 691    function test_tokenizeHTML_noRemoveNewline() {
 692         $this->config->set('Core.NormalizeNewlines', false);
 693         $this->assertTokenization(
 694             "plain\rtext\r\n",
 695             array(
 696                 new HTMLPurifier_Token_Text("plain\rtext\r\n")
 697             )
 698         );
 699      }
 700
 701     function test_tokenizeHTML_conditionalCommentUngreedy() {
 702         $this->assertTokenization(
 703             '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->',
 704             array(
 705                 new HTMLPurifier_Token_Text("b")
 706             )
 707         );
 708     }
 709
 710     function test_tokenizeHTML_imgTag() {
 711         $start = array(
 712                         new HTMLPurifier_Token_Start('img',
 713                             array(
 714                                 'src' => 'img_11775.jpg',
 715                                 'alt' => '[Img #11775]',
 716                                 'id' => 'EMBEDDED_IMG_11775',
 717                             )
 718                         )
 719                     );
 720         $this->assertTokenization(
 721             '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >',
 722             array(
 723                 new HTMLPurifier_Token_Empty('img',
 724                     array(
 725                         'src' => 'img_11775.jpg',
 726                         'alt' => '[Img #11775]',
 727                         'id' => 'EMBEDDED_IMG_11775',
 728                     )
 729                 )
 730             ),
 731             array(
 732                 'DirectLex' => $start,
 733                 )
 734         );
 735     }
 736
 737
 738     /*
 739
 740     function test_tokenizeHTML_() {
 741         $this->assertTokenization(
 742             ,
 743             array(
 744
 745             )
 746         );
 747     }
 748     */
 749
 750 }
 751
 752 // vim: et sw=4 sts=4