tests/HTMLPurifier/LexerTest.php

<?php

   3 class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
   4 {
   5
   6     protected $_has_pear = false;
   7
   8     public function __construct()
   9     {
  10         parent::__construct();
  11         if ($GLOBALS['HTMLPurifierTest']['PH5P']) {
  12             require_once 'HTMLPurifier/Lexer/PH5P.php';
  13         }
  14     }
  15
  16     // HTMLPurifier_Lexer::create() --------------------------------------------
  17
  18     public function test_create()
  19     {
  20         $this->config->set('Core.MaintainLineNumbers', true);
  21         $lexer = HTMLPurifier_Lexer::create($this->config);
  22         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  23     }
  24
  25     public function test_create_objectLexerImpl()
  26     {
  27         $this->config->set('Core.LexerImpl', new HTMLPurifier_Lexer_DirectLex());
  28         $lexer = HTMLPurifier_Lexer::create($this->config);
  29         $this->assertIsA($lexer, 'HTMLPurifier_Lexer_DirectLex');
  30     }
  31
  32     public function test_create_unknownLexer()
  33     {
  34         $this->config->set('Core.LexerImpl', 'AsdfAsdf');
  35         $this->expectException(new HTMLPurifier_Exception('Cannot instantiate unrecognized Lexer type AsdfAsdf'));
  36         HTMLPurifier_Lexer::create($this->config);
  37     }
  38
  39     public function test_create_incompatibleLexer()
  40     {
  41         $this->config->set('Core.LexerImpl', 'DOMLex');
  42         $this->config->set('Core.MaintainLineNumbers', true);
  43         $this->expectException(new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'));
  44         HTMLPurifier_Lexer::create($this->config);
  45     }
  46
  47     // HTMLPurifier_Lexer->parseData() -----------------------------------------
  48
  49     public function assertParseData($input, $expect = true, $is_attr = false)
  50     {
  51         if ($expect === true) $expect = $input;
  52         $lexer = new HTMLPurifier_Lexer();
  53         $this->assertIdentical($expect, $lexer->parseData($input, $is_attr, $this->config));
  54     }
  55
  56     public function test_parseData_plainText()
  57     {
  58         $this->assertParseData('asdf');
  59     }
  60
  61     public function test_parseData_ampersandEntity()
  62     {
  63         $this->assertParseData('&amp;', '&');
  64     }
  65
  66     public function test_parseData_quotEntity()
  67     {
  68         $this->assertParseData('&quot;', '"');
  69     }
  70
  71     public function test_parseData_aposNumericEntity()
  72     {
  73         $this->assertParseData('&#039;', "'");
  74     }
  75
  76     public function test_parseData_aposCompactNumericEntity()
  77     {
  78         $this->assertParseData('&#39;', "'");
  79     }
  80
  81     public function test_parseData_adjacentAmpersandEntities()
  82     {
  83         $this->assertParseData('&amp;&amp;&amp;', '&&&');
  84     }
  85
  86     public function test_parseData_trailingUnescapedAmpersand()
  87     {
  88         $this->assertParseData('&amp;&', '&&');
  89     }
  90
  91     public function test_parseData_internalUnescapedAmpersand()
  92     {
  93         $this->assertParseData('Procter & Gamble');
  94     }
  95
  96     public function test_parseData_improperEntityFaultToleranceTest()
  97     {
  98         $this->assertParseData('&#x2D;', '-');
  99     }
 100
 101     public function test_parseData_noTrailingSemi()
 102     {
 103         $this->assertParseData('&ampA', '&A');
 104     }
 105
 106     public function test_parseData_noTrailingSemiAttr()
 107     {
 108         $this->assertParseData('&ampA', '&ampA', true);
 109     }
 110
 111     public function test_parseData_T119()
 112     {
 113         $this->assertParseData('&ampA', '&ampA', true);
 114     }
 115
 116     public function test_parseData_T119b()
 117     {
 118         $this->assertParseData('&trade=', true, true);
 119     }
 120
 121     public function test_parseData_legacy1()
 122     {
 123         $this->config->set('Core.LegacyEntityDecoder', true);
 124         $this->assertParseData('&ampa', true);
 125         $this->assertParseData('&amp=', "&=");
 126         $this->assertParseData('&ampa', true, true);
 127         $this->assertParseData('&amp=', "&=", true);
 128         $this->assertParseData('&lta', true);
 129         $this->assertParseData('&lt=', "<=");
 130         $this->assertParseData('&lta', true, true);
 131         $this->assertParseData('&lt=', "<=", true);
 132     }
 133
 134     public function test_parseData_nonlegacy1()
 135     {
 136         $this->assertParseData('&ampa', "&a");
 137         $this->assertParseData('&amp=', "&=");
 138         $this->assertParseData('&ampa', true, true);
 139         $this->assertParseData('&amp=', true, true);
 140         $this->assertParseData('&lta', "<a");
 141         $this->assertParseData('&lt=', "<=");
 142         $this->assertParseData('&lta', true, true);
 143         $this->assertParseData('&lt=', true, true);
 144         $this->assertParseData('&lta;', "<a;");
 145     }
 146
 147     public function test_parseData_noTrailingSemiNever()
 148     {
 149         $this->assertParseData('&imath');
 150     }
 151
 152     // HTMLPurifier_Lexer->extractBody() ---------------------------------------
 153
 154     public function assertExtractBody($text, $extract = true)
 155     {
 156         $lexer = new HTMLPurifier_Lexer();
 157         $result = $lexer->extractBody($text);
 158         if ($extract === true) $extract = $text;
 159         $this->assertIdentical($extract, $result);
 160     }
 161
 162     public function test_extractBody_noBodyTags()
 163     {
 164         $this->assertExtractBody('<b>Bold</b>');
 165     }
 166
 167     public function test_extractBody_lowercaseBodyTags()
 168     {
 169         $this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
 170     }
 171
 172     public function test_extractBody_uppercaseBodyTags()
 173     {
 174         $this->assertExtractBody('<HTML><BODY><B>Bold</B></BODY></HTML>', '<B>Bold</B>');
 175     }
 176
 177     public function test_extractBody_realisticUseCase()
 178     {
 179         $this->assertExtractBody(
 180 '<?xml version="1.0"
 181 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 182     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 183 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 184    <head>
 185       <title>xyz</title>
 186    </head>
 187    <body>
 188       <form method="post" action="whatever1">
 189          <div>
 190             <input type="text" name="username" />
 191             <input type="text" name="password" />
 192             <input type="submit" />
 193          </div>
 194       </form>
 195    </body>
 196 </html>',
 197     '
 198       <form method="post" action="whatever1">
 199          <div>
 200             <input type="text" name="username" />
 201             <input type="text" name="password" />
 202             <input type="submit" />
 203          </div>
 204       </form>
 205    ');
 206     }
 207
 208     public function test_extractBody_bodyWithAttributes()
 209     {
 210         $this->assertExtractBody('<html><body bgcolor="#F00"><b>Bold</b></body></html>', '<b>Bold</b>');
 211     }
 212
 213     public function test_extractBody_preserveUnclosedBody()
 214     {
 215         $this->assertExtractBody('<body>asdf'); // not closed, don't accept
 216     }
 217
 218     public function test_extractBody_useLastBody()
 219     {
 220         $this->assertExtractBody('<body>foo</body>bar</body>', 'foo</body>bar');
 221     }
 222
 223     public function test_extractBody_ignoreCommented()
 224     {
 225         $this->assertExtractBody('$<!-- <body>foo</body> -->^');
 226     }
 227
 228     public function test_extractBody_butCanStillWork()
 229     {
 230         $this->assertExtractBody('<!-- b --><body>a</body>', 'a');
 231     }
 232
 233     // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
 234
 235     public function assertTokenization($input, $expect, $alt_expect = array())
 236     {
 237         $lexers = array();
 238         $lexers['DirectLex']  = new HTMLPurifier_Lexer_DirectLex();
 239         if (class_exists('DOMDocument')) {
 240             $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
 241             $lexers['PH5P']   = new HTMLPurifier_Lexer_PH5P();
 242         }
 243         foreach ($lexers as $name => $lexer) {
 244             $result = $lexer->tokenizeHTML($input, $this->config, $this->context);
 245             if (isset($alt_expect[$name])) {
 246                 if ($alt_expect[$name] === false) continue;
 247                 $t_expect = $alt_expect[$name];
 248                 $this->assertIdentical($result, $alt_expect[$name], "$name: %s");
 249             } else {
 250                 $t_expect = $expect;
 251                 $this->assertIdentical($result, $expect, "$name: %s");
 252             }
 253             if ($t_expect != $result) {
 254                 printTokens($result);
 255             }
 256         }
 257     }
 258
 259     public function test_tokenizeHTML_emptyInput()
 260     {
 261         $this->assertTokenization('', array());
 262     }
 263
 264     public function test_tokenizeHTML_plainText()
 265     {
 266         $this->assertTokenization(
 267             'This is regular text.',
 268             array(
 269                 new HTMLPurifier_Token_Text('This is regular text.')
 270             )
 271         );
 272     }
 273
 274     public function test_tokenizeHTML_textAndTags()
 275     {
 276         $this->assertTokenization(
 277             'This is <b>bold</b> text',
 278             array(
 279                 new HTMLPurifier_Token_Text('This is '),
 280                 new HTMLPurifier_Token_Start('b', array()),
 281                 new HTMLPurifier_Token_Text('bold'),
 282                 new HTMLPurifier_Token_End('b'),
 283                 new HTMLPurifier_Token_Text(' text'),
 284             )
 285         );
 286     }
 287
 288     public function test_tokenizeHTML_normalizeCase()
 289     {
 290         $this->assertTokenization(
 291             '<DIV>Totally rad dude. <b>asdf</b></div>',
 292             array(
 293                 new HTMLPurifier_Token_Start('DIV', array()),
 294                 new HTMLPurifier_Token_Text('Totally rad dude. '),
 295                 new HTMLPurifier_Token_Start('b', array()),
 296                 new HTMLPurifier_Token_Text('asdf'),
 297                 new HTMLPurifier_Token_End('b'),
 298                 new HTMLPurifier_Token_End('div'),
 299             )
 300         );
 301     }
 302
 303     public function test_tokenizeHTML_notWellFormed()
 304     {
 305         $this->assertTokenization(
 306             '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>',
 307             array(
 308                 new HTMLPurifier_Token_Start('asdf'),
 309                 new HTMLPurifier_Token_End('asdf'),
 310                 new HTMLPurifier_Token_Start('d'),
 311                 new HTMLPurifier_Token_End('d'),
 312                 new HTMLPurifier_Token_Start('poOloka'),
 313                 new HTMLPurifier_Token_Start('poolasdf'),
 314                 new HTMLPurifier_Token_Start('ds'),
 315                 new HTMLPurifier_Token_End('asdf'),
 316                 new HTMLPurifier_Token_End('ASDF'),
 317             ),
 318             array(
 319                 'DOMLex' => $alt = array(
 320                     new HTMLPurifier_Token_Empty('asdf'),
 321                     new HTMLPurifier_Token_Empty('d'),
 322                     new HTMLPurifier_Token_Start('pooloka'),
 323                     new HTMLPurifier_Token_Start('poolasdf'),
 324                     new HTMLPurifier_Token_Empty('ds'),
 325                     new HTMLPurifier_Token_End('poolasdf'),
 326                     new HTMLPurifier_Token_End('pooloka'),
 327                 ),
 328                 // 20140831: Weird, but whatever...
 329                 'PH5P' => array(new HTMLPurifier_Token_Empty('asdf')),
 330             )
 331         );
 332     }
 333
 334     public function test_tokenizeHTML_whitespaceInTag()
 335     {
 336         $this->assertTokenization(
 337             '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>',
 338             array(
 339                 new HTMLPurifier_Token_Start('a',array('href'=>'foobar.php','title'=>'foo!')),
 340                 new HTMLPurifier_Token_Text('Link to '),
 341                 new HTMLPurifier_Token_Start('b',array('id'=>'asdf')),
 342                 new HTMLPurifier_Token_Text('foobar'),
 343                 new HTMLPurifier_Token_End('b'),
 344                 new HTMLPurifier_Token_End('a'),
 345             )
 346         );
 347     }
 348
 349     public function test_tokenizeHTML_singleAttribute()
 350     {
 351         $this->assertTokenization(
 352             '<br style="&amp;" />',
 353             array(
 354                 new HTMLPurifier_Token_Empty('br', array('style' => '&'))
 355             )
 356         );
 357     }
 358
 359     public function test_tokenizeHTML_emptyTag()
 360     {
 361         $this->assertTokenization(
 362             '<br />',
 363             array( new HTMLPurifier_Token_Empty('br') )
 364         );
 365     }
 366
 367     public function test_tokenizeHTML_comment()
 368     {
 369         $this->assertTokenization(
 370             '<!-- Comment -->',
 371             array( new HTMLPurifier_Token_Comment(' Comment ') )
 372         );
 373     }
 374
 375     public function test_tokenizeHTML_malformedComment()
 376     {
 377         $this->assertTokenization(
 378             '<!-- not so well formed --->',
 379             array( new HTMLPurifier_Token_Comment(' not so well formed -') )
 380         );
 381     }
 382
 383     public function test_tokenizeHTML_unterminatedTag()
 384     {
 385         $this->assertTokenization(
 386             '<a href=""',
 387             array( new HTMLPurifier_Token_Text('<a href=""') ),
 388             array(
 389                 // I like our behavior better, but it's non-standard
 390                 'DOMLex'   => array( new HTMLPurifier_Token_Empty('a', array('href'=>'')) ),
 391                 'PH5P' => false, // total barfing, grabs scaffolding too
 392             )
 393         );
 394     }
 395
 396     public function test_tokenizeHTML_specialEntities()
 397     {
 398         $this->assertTokenization(
 399             '&lt;b&gt;',
 400             array(
 401                 new HTMLPurifier_Token_Text('<b>')
 402             ),
 403             array(
 404                 // some parsers will separate entities out
 405                 'PH5P' => array(
 406                     new HTMLPurifier_Token_Text('<'),
 407                     new HTMLPurifier_Token_Text('b'),
 408                     new HTMLPurifier_Token_Text('>'),
 409                 ),
 410             )
 411         );
 412     }
 413
 414     public function test_tokenizeHTML_earlyQuote()
 415     {
 416         $this->assertTokenization(
 417             '<a "=>',
 418             array( new HTMLPurifier_Token_Empty('a') ),
 419             array(
 420                 // we barf on this input
 421                 'DirectLex' => array(
 422                     new HTMLPurifier_Token_Start('a', array('"' => ''))
 423                 ),
 424                 'PH5P' => false, // behavior varies; handle this personally
 425             )
 426         );
 427     }
 428
 429     public function test_tokenizeHTML_earlyQuote_PH5P()
 430     {
 431         if (!class_exists('DOMDocument')) return;
 432         $lexer = new HTMLPurifier_Lexer_PH5P();
 433         $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
 434         if ($this->context->get('PH5PError', true)) {
 435             $this->assertIdentical(array(
 436                 new HTMLPurifier_Token_Start('a', array('"' => ''))
 437             ), $result);
 438         } else {
 439             $this->assertIdentical(array(
 440                 new HTMLPurifier_Token_Empty('a', array('"' => ''))
 441             ), $result);
 442         }
 443     }
 444
 445     public function test_tokenizeHTML_unescapedQuote()
 446     {
 447         $this->assertTokenization(
 448             '"',
 449             array( new HTMLPurifier_Token_Text('"') )
 450         );
 451     }
 452
 453     public function test_tokenizeHTML_escapedQuote()
 454     {
 455         $this->assertTokenization(
 456             '&quot;',
 457             array( new HTMLPurifier_Token_Text('"') )
 458         );
 459     }
 460
 461     public function test_tokenizeHTML_cdata()
 462     {
 463         $this->assertTokenization(
 464             '<![CDATA[You <b>can&#39;t</b> get me!]]>',
 465             array( new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!') ),
 466             array(
 467                 'PH5P' =>  array(
 468                     new HTMLPurifier_Token_Text('You '),
 469                     new HTMLPurifier_Token_Text('<'),
 470                     new HTMLPurifier_Token_Text('b'),
 471                     new HTMLPurifier_Token_Text('>'),
 472                     new HTMLPurifier_Token_Text('can'),
 473                     new HTMLPurifier_Token_Text('&'),
 474                     new HTMLPurifier_Token_Text('#39;t'),
 475                     new HTMLPurifier_Token_Text('<'),
 476                     new HTMLPurifier_Token_Text('/b'),
 477                     new HTMLPurifier_Token_Text('>'),
 478                     new HTMLPurifier_Token_Text(' get me!'),
 479                 ),
 480             )
 481         );
 482     }
 483
 484     public function test_tokenizeHTML_characterEntity()
 485     {
 486         $this->assertTokenization(
 487             '&theta;',
 488             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 489         );
 490     }
 491
 492     public function test_tokenizeHTML_characterEntityInCDATA()
 493     {
 494         $this->assertTokenization(
 495             '<![CDATA[&rarr;]]>',
 496             array( new HTMLPurifier_Token_Text("&rarr;") ),
 497             array(
 498                 'PH5P' => array(
 499                     new HTMLPurifier_Token_Text('&'),
 500                     new HTMLPurifier_Token_Text('rarr;'),
 501                 ),
 502             )
 503         );
 504     }
 505
 506     public function test_tokenizeHTML_entityInAttribute()
 507     {
 508         $this->assertTokenization(
 509             '<a href="index.php?title=foo&amp;id=bar">Link</a>',
 510             array(
 511                 new HTMLPurifier_Token_Start('a',array('href' => 'index.php?title=foo&id=bar')),
 512                 new HTMLPurifier_Token_Text('Link'),
 513                 new HTMLPurifier_Token_End('a'),
 514             )
 515         );
 516     }
 517
 518     public function test_tokenizeHTML_preserveUTF8()
 519     {
 520         $this->assertTokenization(
 521             "\xCE\xB8",
 522             array( new HTMLPurifier_Token_Text("\xCE\xB8") )
 523         );
 524     }
 525
 526     public function test_tokenizeHTML_specialEntityInAttribute()
 527     {
 528         $this->assertTokenization(
 529             '<br test="x &lt; 6" />',
 530             array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) )
 531         );
 532     }
 533
 534     public function test_tokenizeHTML_emoticonProtection()
 535     {
 536         $this->assertTokenization(
 537             '<b>Whoa! <3 That\'s not good >.></b>',
 538             array(
 539                 new HTMLPurifier_Token_Start('b'),
 540                 new HTMLPurifier_Token_Text('Whoa! '),
 541                 new HTMLPurifier_Token_Text('<'),
 542                 new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 543                 new HTMLPurifier_Token_End('b')
 544             ),
 545             array(
 546                 // text is absorbed together
 547                 'DOMLex' => array(
 548                     new HTMLPurifier_Token_Start('b'),
 549                     new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
 550                     new HTMLPurifier_Token_End('b'),
 551                 ),
 552                 'PH5P' => array( // interesting grouping
 553                     new HTMLPurifier_Token_Start('b'),
 554                     new HTMLPurifier_Token_Text('Whoa! '),
 555                     new HTMLPurifier_Token_Text('<'),
 556                     new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
 557                     new HTMLPurifier_Token_End('b'),
 558                 ),
 559             )
 560         );
 561     }
 562
 563     public function test_tokenizeHTML_commentWithFunkyChars()
 564     {
 565         $this->assertTokenization(
 566             '<!-- This >< comment --><br />',
 567             array(
 568                 new HTMLPurifier_Token_Comment(' This >< comment '),
 569                 new HTMLPurifier_Token_Empty('br'),
 570             )
 571         );
 572     }
 573
 574     public function test_tokenizeHTML_unterminatedComment()
 575     {
 576         $this->assertTokenization(
 577             '<!-- This >< comment',
 578             array( new HTMLPurifier_Token_Comment(' This >< comment') ),
 579             array(
 580                 'DOMLex'   => false,
 581                 'PH5P'     => false,
 582             )
 583         );
 584     }
 585
 586     public function test_tokenizeHTML_scriptCDATAContents()
 587     {
 588         $this->config->set('HTML.Trusted', true);
 589         $this->assertTokenization(
 590             'Foo: <script>alert("<foo>");</script>',
 591             array(
 592                 new HTMLPurifier_Token_Text('Foo: '),
 593                 new HTMLPurifier_Token_Start('script'),
 594                 new HTMLPurifier_Token_Text('alert("<foo>");'),
 595                 new HTMLPurifier_Token_End('script'),
 596             ),
 597             array(
 598                 // PH5P, for some reason, bubbles the script to <head>
 599                 'PH5P' => false,
 600             )
 601         );
 602     }
 603
 604     public function test_tokenizeHTML_entitiesInComment()
 605     {
 606         $this->assertTokenization(
 607             '<!-- This comment < &lt; & -->',
 608             array( new HTMLPurifier_Token_Comment(' This comment < &lt; & ') )
 609         );
 610     }
 611
 612     public function test_tokenizeHTML_attributeWithSpecialCharacters()
 613     {
 614         $this->assertTokenization(
 615             '<a href="><>">',
 616             array( new HTMLPurifier_Token_Empty('a', array('href' => '><>')) ),
 617             array(
 618                 'DirectLex' => array(
 619                     new HTMLPurifier_Token_Start('a', array('href' => '')),
 620                     new HTMLPurifier_Token_Text('<'),
 621                     new HTMLPurifier_Token_Text('">'),
 622                 )
 623             )
 624         );
 625     }
 626
 627     public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
 628     {
 629         $this->assertTokenization(
 630             '<param name="src" value="http://example.com/video.wmv" />',
 631             array( new HTMLPurifier_Token_Empty('param', array('name' => 'src', 'value' => 'http://example.com/video.wmv')) )
 632         );
 633     }
 634
 635     public function test_tokenizeHTML_style()
 636     {
 637         $extra = array(
 638                 // PH5P doesn't seem to like style tags
 639                 'PH5P' => false,
 640                 // DirectLex defers to RemoveForeignElements for textification
 641                 'DirectLex' => array(
 642                     new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 643                     new HTMLPurifier_Token_Comment("\ndiv {}\n"),
 644                     new HTMLPurifier_Token_End('style'),
 645                 ),
 646             );
 647         if (!defined('LIBXML_VERSION')) {
 648             // LIBXML_VERSION is missing in early versions of PHP
 649             // prior to 1.30 of php-src/ext/libxml/libxml.c (version-wise,
 650             // this translates to 5.0.x. In such cases, punt the test entirely.
 651             return;
 652         } elseif (LIBXML_VERSION < 20628) {
 653             // libxml's behavior is wrong prior to this version, so make
 654             // appropriate accomodations
 655             $extra['DOMLex'] = $extra['DirectLex'];
 656         }
 657         $this->assertTokenization(
 658 '<style type="text/css"><!--
 659 div {}
 660 --></style>',
 661             array(
 662                 new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
 663                 new HTMLPurifier_Token_Text("\ndiv {}\n"),
 664                 new HTMLPurifier_Token_End('style'),
 665             ),
 666             $extra
 667         );
 668     }
 669
 670     public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
 671     {
 672         $alt_expect = array(
 673             // Technically this is invalid, but it won't be a
 674             // problem with invalid element removal; also, this
 675             // mimics Mozilla's parsing of the tag.
 676             new HTMLPurifier_Token_Start('a@'),
 677             new HTMLPurifier_Token_Text('>'),
 678         );
 679         $this->assertTokenization(
 680             '<a@>>',
 681             array(
 682                 new HTMLPurifier_Token_Start('a'),
 683                 new HTMLPurifier_Token_Text('>'),
 684                 new HTMLPurifier_Token_End('a'),
 685             ),
 686             array(
 687                 'DirectLex' => $alt_expect,
 688             )
 689         );
 690     }
 691
 692     public function test_tokenizeHTML_emoticonHeart()
 693     {
 694         $this->assertTokenization(
 695             '<br /><3<br />',
 696             array(
 697                 new HTMLPurifier_Token_Empty('br'),
 698                 new HTMLPurifier_Token_Text('<'),
 699                 new HTMLPurifier_Token_Text('3'),
 700                 new HTMLPurifier_Token_Empty('br'),
 701             ),
 702             array(
 703                 'DOMLex' => array(
 704                     new HTMLPurifier_Token_Empty('br'),
 705                     new HTMLPurifier_Token_Text('<3'),
 706                     new HTMLPurifier_Token_Empty('br'),
 707                 ),
 708             )
 709         );
 710     }
 711
 712     public function test_tokenizeHTML_emoticonShiftyEyes()
 713     {
 714         $this->assertTokenization(
 715             '<b><<</b>',
 716             array(
 717                 new HTMLPurifier_Token_Start('b'),
 718                 new HTMLPurifier_Token_Text('<'),
 719                 new HTMLPurifier_Token_Text('<'),
 720                 new HTMLPurifier_Token_End('b'),
 721             ),
 722             array(
 723                 'DOMLex' => array(
 724                     new HTMLPurifier_Token_Start('b'),
 725                     new HTMLPurifier_Token_Text('<<'),
 726                     new HTMLPurifier_Token_End('b'),
 727                 ),
 728             )
 729         );
 730     }
 731
 732     public function test_tokenizeHTML_eon1996()
 733     {
 734         $this->assertTokenization(
 735             '< <b>test</b>',
 736             array(
 737                 new HTMLPurifier_Token_Text('<'),
 738                 new HTMLPurifier_Token_Text(' '),
 739                 new HTMLPurifier_Token_Start('b'),
 740                 new HTMLPurifier_Token_Text('test'),
 741                 new HTMLPurifier_Token_End('b'),
 742             ),
 743             array(
 744                 'DOMLex' => array(
 745                     new HTMLPurifier_Token_Text('< '),
 746                     new HTMLPurifier_Token_Start('b'),
 747                     new HTMLPurifier_Token_Text('test'),
 748                     new HTMLPurifier_Token_End('b'),
 749                 ),
 750             )
 751         );
 752     }
 753
 754     public function test_tokenizeHTML_bodyInCDATA()
 755     {
 756         $alt_tokens = array(
 757             new HTMLPurifier_Token_Text('<'),
 758             new HTMLPurifier_Token_Text('body'),
 759             new HTMLPurifier_Token_Text('>'),
 760             new HTMLPurifier_Token_Text('Foo'),
 761             new HTMLPurifier_Token_Text('<'),
 762             new HTMLPurifier_Token_Text('/body'),
 763             new HTMLPurifier_Token_Text('>'),
 764         );
 765         $this->assertTokenization(
 766             '<![CDATA[<body>Foo</body>]]>',
 767             array(
 768                 new HTMLPurifier_Token_Text('<body>Foo</body>'),
 769             ),
 770             array(
 771                 'PH5P' => $alt_tokens,
 772             )
 773         );
 774     }
 775
 776     public function test_tokenizeHTML_()
 777     {
 778         $this->assertTokenization(
 779             '<a><img /></a>',
 780             array(
 781                 new HTMLPurifier_Token_Start('a'),
 782                 new HTMLPurifier_Token_Empty('img'),
 783                 new HTMLPurifier_Token_End('a'),
 784             )
 785         );
 786     }
 787
 788     public function test_tokenizeHTML_ignoreIECondComment()
 789     {
 790         $this->assertTokenization(
 791             '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->',
 792             array()
 793         );
 794     }
 795
 796     public function test_tokenizeHTML_removeProcessingInstruction()
 797     {
 798         $this->config->set('Core.RemoveProcessingInstructions', true);
 799         $this->assertTokenization(
 800             '<?xml blah blah ?>',
 801             array()
 802         );
 803     }
 804
 805    public function test_tokenizeHTML_removeNewline()
 806    {
 807         $this->config->set('Core.NormalizeNewlines', true);
 808         $this->assertTokenization(
 809             "plain\rtext\r\n",
 810             array(
 811                 new HTMLPurifier_Token_Text("plain\ntext\n")
 812             )
 813         );
 814    }
 815
 816    public function test_tokenizeHTML_noRemoveNewline()
 817    {
 818         $this->config->set('Core.NormalizeNewlines', false);
 819         $this->assertTokenization(
 820             "plain\rtext\r\n",
 821             array(
 822                 new HTMLPurifier_Token_Text("plain\rtext\r\n")
 823             )
 824         );
 825      }
 826
 827     public function test_tokenizeHTML_conditionalCommentUngreedy()
 828     {
 829         $this->assertTokenization(
 830             '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->',
 831             array(
 832                 new HTMLPurifier_Token_Text("b")
 833             )
 834         );
 835     }
 836
 837     public function test_tokenizeHTML_imgTag()
 838     {
 839         $start = array(
 840                         new HTMLPurifier_Token_Start('img',
 841                             array(
 842                                 'src' => 'img_11775.jpg',
 843                                 'alt' => '[Img #11775]',
 844                                 'id' => 'EMBEDDED_IMG_11775',
 845                             )
 846                         )
 847                     );
 848         $this->assertTokenization(
 849             '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >',
 850             array(
 851                 new HTMLPurifier_Token_Empty('img',
 852                     array(
 853                         'src' => 'img_11775.jpg',
 854                         'alt' => '[Img #11775]',
 855                         'id' => 'EMBEDDED_IMG_11775',
 856                     )
 857                 )
 858             ),
 859             array(
 860                 'DirectLex' => $start,
 861                 )
 862         );
 863     }
 864
 865     public function test_tokenizeHTML_prematureDivClose()
 866     {
 867         $this->assertTokenization(
 868             '</div>dont<b>die</b>',
 869             array(
 870                 new HTMLPurifier_Token_End('div'),
 871                 new HTMLPurifier_Token_Text('dont'),
 872                 new HTMLPurifier_Token_Start('b'),
 873                 new HTMLPurifier_Token_Text('die'),
 874                 new HTMLPurifier_Token_End('b'),
 875             ),
 876             array(
 877                 'DOMLex' => $alt = array(
 878                     new HTMLPurifier_Token_Text('dont'),
 879                     new HTMLPurifier_Token_Start('b'),
 880                     new HTMLPurifier_Token_Text('die'),
 881                     new HTMLPurifier_Token_End('b')
 882                 ),
 883                 'PH5P' => $alt
 884             )
 885         );
 886     }
 887
 888
 889     /*
 890
 891     public function test_tokenizeHTML_()
 892     {
 893         $this->assertTokenization(
 894             ,
 895             array(
 896
 897             )
 898         );
 899     }
 900     */
 901
 902 }

// vim: et sw=4 sts=4