vendor/dompdf/dompdf/lib/html5lib/Tokenizer.php

   1 <?php
   2
   3 /*
   4
   5 Copyright 2007 Jeroen van der Meer <http://jero.net/>
   6 Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
   7 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
   8
   9 Permission is hereby granted, free of charge, to any person obtaining a
  10 copy of this software and associated documentation files (the
  11 "Software"), to deal in the Software without restriction, including
  12 without limitation the rights to use, copy, modify, merge, publish,
  13 distribute, sublicense, and/or sell copies of the Software, and to
  14 permit persons to whom the Software is furnished to do so, subject to
  15 the following conditions:
  16
  17 The above copyright notice and this permission notice shall be included
  18 in all copies or substantial portions of the Software.
  19
  20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27
  28 */
  29
  30 // Some conventions:
  31 // /* */ indicates verbatim text from the HTML 5 specification
  32 // // indicates regular comments
  33
  34 // all flags are in hyphenated form
  35
  36 class HTML5_Tokenizer {
  37     /**
  38      * @var HTML5_InputStream
  39      *
  40      * Points to an InputStream object.
  41      */
  42     protected $stream;
  43
  44     /**
  45      * @var HTML5_TreeBuilder
  46      *
  47      * Tree builder that the tokenizer emits tokens to.
  48      */
  49     private $tree;
  50
  51     /**
  52      * @var int
  53      *
  54      * Current content model we are parsing as.
  55      */
  56     protected $content_model;
  57
  58     /**
  59      * Current token that is being built, but not yet emitted. Also
  60      * is the last token emitted, if applicable.
  61      */
  62     protected $token;
  63
  64     // These are constants describing the content model
  65     const PCDATA    = 0;
  66     const RCDATA    = 1;
  67     const CDATA     = 2;
  68     const PLAINTEXT = 3;
  69
  70     // These are constants describing tokens
  71     // XXX should probably be moved somewhere else, probably the
  72     // HTML5 class.
  73     const DOCTYPE        = 0;
  74     const STARTTAG       = 1;
  75     const ENDTAG         = 2;
  76     const COMMENT        = 3;
  77     const CHARACTER      = 4;
  78     const SPACECHARACTER = 5;
  79     const EOF            = 6;
  80     const PARSEERROR     = 7;
  81
  82     // These are constants representing bunches of characters.
  83     const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
  84     const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
  85     const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
  86     const DIGIT       = '0123456789';
  87     const HEX         = '0123456789ABCDEFabcdef';
  88     const WHITESPACE  = "\t\n\x0c ";
  89
  90     /**
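  91      * @param mixed $data Data to parse; wrapped in an HTML5_InputStream.
  92      * @param HTML5_TreeBuilder|null $builder Tree builder to emit tokens to; a new one is created if omitted.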
  93      */
  94     public function __construct($data, $builder = null) {
  95         // Wrap the raw input in the stream that characters are read from.
  96         $this->stream = new HTML5_InputStream($data);
  97
  98         // Emit to the supplied tree builder, or to a fresh one if none was given.
  99         $this->tree = $builder ? $builder : new HTML5_TreeBuilder;
 100
 101         $this->content_model = self::PCDATA; // initial content model
 102     }
 103
 104     /**
 105      * @param mixed $context Context forwarded to the tree builder's setupContext(); null by default.
 106      */
 107     public function parseFragment($context = null) {
 108         $this->tree->setupContext($context);
 109         if (($model = $this->tree->content_model)) { // only when the builder holds a truthy model
 110             $this->content_model = $model;           // tokenize using that model
 111             $this->tree->content_model = null;       // and clear the builder's copy
 112         }
 113         $this->parse();
 114     }
 115
 116     // XXX maybe convert this into an iterator? regardless, this function
 117     // and the save function should go into a Parser facade of some sort
 118     /**
 119      * Performs the actual parsing of the document.
 120      */
 121     public function parse() {
 122         // Current state
 123         $state = 'data';
 124         // This is used to avoid having to have look-behind in the data state.
 125         $lastFourChars = '';
 126         /**
 127          * Escape flag as specified by the HTML5 specification: "used to
 128          * control the behavior of the tokeniser. It is either true or
 129          * false, and initially must be set to the false state."
 130          */
 131         $escape = false;
 132         //echo "\n\n";
 133         while($state !== null) {
 134
 135             /*echo $state . ' ';
 136             switch ($this->content_model) {
 137                 case self::PCDATA: echo 'PCDATA'; break;
 138                 case self::RCDATA: echo 'RCDATA'; break;
 139                 case self::CDATA: echo 'CDATA'; break;
 140                 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
 141             }
 142             if ($escape) echo " escape";
 143             echo "\n";*/
 144
 145             switch($state) {
 146                 case 'data':
 147
 148                     /* Consume the next input character */
 149                     $char = $this->stream->char();
 150                     $lastFourChars .= $char;
 151                     if (strlen($lastFourChars) > 4) {
 152                         $lastFourChars = substr($lastFourChars, -4);
 153                     }
 154
 155                     // see below for meaning
 156                     $hyp_cond =
 157                         !$escape &&
 158                         (
 159                             $this->content_model === self::RCDATA ||
 160                             $this->content_model === self::CDATA
 161                         );
 162                     $amp_cond =
 163                         !$escape &&
 164                         (
 165                             $this->content_model === self::PCDATA ||
 166                             $this->content_model === self::RCDATA
 167                         );
 168                     $lt_cond =
 169                         $this->content_model === self::PCDATA ||
 170                         (
 171                             (
 172                                 $this->content_model === self::RCDATA ||
 173                                 $this->content_model === self::CDATA
 174                              ) &&
 175                              !$escape
 176                         );
 177                     $gt_cond =
 178                         $escape &&
 179                         (
 180                             $this->content_model === self::RCDATA ||
 181                             $this->content_model === self::CDATA
 182                         );
 183
 184                     if ($char === '&' && $amp_cond === true) {
 185                         /* U+0026 AMPERSAND (&)
 186                         When the content model flag is set to one of the PCDATA or RCDATA
 187                         states and the escape flag is false: switch to the
 188                         character reference data state. Otherwise: treat it as per
 189                         the "anything else" entry below. */
 190                         $state = 'character reference data';
 191
 192                     } elseif (
 193                         $char === '-' &&
 194                         $hyp_cond === true &&
 195                         $lastFourChars === '<!--'
 196                     ) {
 197                         /*
 198                         U+002D HYPHEN-MINUS (-)
 199                         If the content model flag is set to either the RCDATA state or
 200                         the CDATA state, and the escape flag is false, and there are at
 201                         least three characters before this one in the input stream, and the
 202                         last four characters in the input stream, including this one, are
 203                         U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
 204                         and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
 205                         $escape = true;
 206
 207                         /* In any case, emit the input character as a character token. Stay
 208                         in the data state. */
 209                         $this->emitToken(array(
 210                             'type' => self::CHARACTER,
 211                             'data' => '-'
 212                         ));
 213                         // We do the "any case" part as part of "anything else".
 214
 215                     /* U+003C LESS-THAN SIGN (<) */
 216                     } elseif ($char === '<' && $lt_cond === true) {
 217                         /* When the content model flag is set to the PCDATA state: switch
 218                         to the tag open state.
 219
 220                         When the content model flag is set to either the RCDATA state or
 221                         the CDATA state and the escape flag is false: switch to the tag
 222                         open state.
 223
 224                         Otherwise: treat it as per the "anything else" entry below. */
 225                         $state = 'tag open';
 226
 227                     /* U+003E GREATER-THAN SIGN (>) */
 228                     } elseif (
 229                         $char === '>' &&
 230                         $gt_cond === true &&
 231                         substr($lastFourChars, 1) === '-->'
 232                     ) {
 233                         /* If the content model flag is set to either the RCDATA state or
 234                         the CDATA state, and the escape flag is true, and the last three
 235                         characters in the input stream including this one are U+002D
 236                         HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
 237                         set the escape flag to false. */
 238                         $escape = false;
 239
 240                         /* In any case, emit the input character as a character token.
 241                         Stay in the data state. */
 242                         $this->emitToken(array(
 243                             'type' => self::CHARACTER,
 244                             'data' => '>'
 245                         ));
 246                         // We do the "any case" part as part of "anything else".
 247
 248                     } elseif ($char === false) {
 249                         /* EOF
 250                         Emit an end-of-file token. */
 251                         $state = null;
 252                         $this->tree->emitToken(array(
 253                             'type' => self::EOF
 254                         ));
 255
 256                     } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 257                         // Directly after emitting a token you switch back to the "data
 258                         // state". At that point spaceCharacters are important so they are
 259                         // emitted separately.
 260                         $chars = $this->stream->charsWhile(self::WHITESPACE);
 261                         $this->emitToken(array(
 262                             'type' => self::SPACECHARACTER,
 263                             'data' => $char . $chars
 264                         ));
 265                         $lastFourChars .= $chars;
 266                         if (strlen($lastFourChars) > 4) {
 267                             $lastFourChars = substr($lastFourChars, -4);
 268                         }
 269                     } else {
 270                         /* Anything else
 271                         THIS IS AN OPTIMIZATION: Get as many characters as
 272                         would otherwise also be treated as character tokens and emit them
 273                         as a single character token. Stay in the data state. */
 274
 275                         $mask = '';
 276                         if ($hyp_cond === true) {
 277                             $mask .= '-';
 278                         }
 279                         if ($amp_cond === true) {
 280                             $mask .= '&';
 281                         }
 282                         if ($lt_cond === true) {
 283                             $mask .= '<';
 284                         }
 285                         if ($gt_cond === true) {
 286                             $mask .= '>';
 287                         }
 288
 289                         if ($mask === '') {
 290                             $chars = $this->stream->remainingChars();
 291                         } else {
 292                             $chars = $this->stream->charsUntil($mask);
 293                         }
 294
 295                         $this->emitToken(array(
 296                             'type' => self::CHARACTER,
 297                             'data' => $char . $chars
 298                         ));
 299
 300                         $lastFourChars .= $chars;
 301                         if (strlen($lastFourChars) > 4) {
 302                             $lastFourChars = substr($lastFourChars, -4);
 303                         }
 304
 305                         $state = 'data';
 306                     }
 307                 break;
 308
 309                 case 'character reference data':
 310                     /* (This cannot happen if the content model flag
 311                     is set to the CDATA state.) */
 312
 313                     /* Attempt to consume a character reference, with no
 314                     additional allowed character. */
 315                     $entity = $this->consumeCharacterReference();
 316
 317                     /* If nothing is returned, emit a U+0026 AMPERSAND
 318                     character token. Otherwise, emit the character token that
 319                     was returned. */
 320                     // This is all done when consuming the character reference.
 321                     $this->emitToken(array(
 322                         'type' => self::CHARACTER,
 323                         'data' => $entity
 324                     ));
 325
 326                     /* Finally, switch to the data state. */
 327                     $state = 'data';
 328                 break;
 329
 330                 case 'tag open':
 331                     $char = $this->stream->char();
 332
 333                     switch ($this->content_model) {
 334                         case self::RCDATA:
 335                         case self::CDATA:
 336                             /* Consume the next input character. If it is a
 337                             U+002F SOLIDUS (/) character, switch to the close
 338                             tag open state. Otherwise, emit a U+003C LESS-THAN
 339                             SIGN character token and reconsume the current input
 340                             character in the data state. */
 341                             // We consumed above.
 342
 343                             if ($char === '/') {
 344                                 $state = 'close tag open';
 345                             } else {
 346                                 $this->emitToken(array(
 347                                     'type' => self::CHARACTER,
 348                                     'data' => '<'
 349                                 ));
 350
 351                                 $this->stream->unget();
 352
 353                                 $state = 'data';
 354                             }
 355                         break;
 356
 357                         case self::PCDATA:
 358                             /* If the content model flag is set to the PCDATA state
 359                             Consume the next input character: */
 360                             // We consumed above.
 361
 362                             if ($char === '!') {
 363                                 /* U+0021 EXCLAMATION MARK (!)
 364                                 Switch to the markup declaration open state. */
 365                                 $state = 'markup declaration open';
 366
 367                             } elseif ($char === '/') {
 368                                 /* U+002F SOLIDUS (/)
 369                                 Switch to the close tag open state. */
 370                                 $state = 'close tag open';
 371
 372                             } elseif ('A' <= $char && $char <= 'Z') {
 373                                 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 374                                 Create a new start tag token, set its tag name to the lowercase
 375                                 version of the input character (add 0x0020 to the character's code
 376                                 point), then switch to the tag name state. (Don't emit the token
 377                                 yet; further details will be filled in before it is emitted.) */
 378                                 $this->token = array(
 379                                     'name'  => strtolower($char),
 380                                     'type'  => self::STARTTAG,
 381                                     'attr'  => array()
 382                                 );
 383
 384                                 $state = 'tag name';
 385
 386                             } elseif ('a' <= $char && $char <= 'z') {
 387                                 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
 388                                 Create a new start tag token, set its tag name to the input
 389                                 character, then switch to the tag name state. (Don't emit
 390                                 the token yet; further details will be filled in before it
 391                                 is emitted.) */
 392                                 $this->token = array(
 393                                     'name'  => $char,
 394                                     'type'  => self::STARTTAG,
 395                                     'attr'  => array()
 396                                 );
 397
 398                                 $state = 'tag name';
 399
 400                             } elseif ($char === '>') {
 401                                 /* U+003E GREATER-THAN SIGN (>)
 402                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
 403                                 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
 404                                 $this->emitToken(array(
 405                                     'type' => self::PARSEERROR,
 406                                     'data' => 'expected-tag-name-but-got-right-bracket'
 407                                 ));
 408                                 $this->emitToken(array(
 409                                     'type' => self::CHARACTER,
 410                                     'data' => '<>'
 411                                 ));
 412
 413                                 $state = 'data';
 414
 415                             } elseif ($char === '?') {
 416                                 /* U+003F QUESTION MARK (?)
 417                                 Parse error. Switch to the bogus comment state. */
 418                                 $this->emitToken(array(
 419                                     'type' => self::PARSEERROR,
 420                                     'data' => 'expected-tag-name-but-got-question-mark'
 421                                 ));
 422                                 $this->token = array(
 423                                     'data' => '?',
 424                                     'type' => self::COMMENT
 425                                 );
 426                                 $state = 'bogus comment';
 427
 428                             } else {
 429                                 /* Anything else
 430                                 Parse error. Emit a U+003C LESS-THAN SIGN character token and
 431                                 reconsume the current input character in the data state. */
 432                                 $this->emitToken(array(
 433                                     'type' => self::PARSEERROR,
 434                                     'data' => 'expected-tag-name'
 435                                 ));
 436                                 $this->emitToken(array(
 437                                     'type' => self::CHARACTER,
 438                                     'data' => '<'
 439                                 ));
 440
 441                                 $state = 'data';
 442                                 $this->stream->unget();
 443                             }
 444                         break;
 445                     }
 446                 break;
 447
 448                 case 'close tag open':
 449                     if (
 450                         $this->content_model === self::RCDATA ||
 451                         $this->content_model === self::CDATA
 452                     ) {
 453                         /* If the content model flag is set to the RCDATA or CDATA
 454                         states... */
 455                         $name = strtolower($this->stream->charsWhile(self::ALPHA));
 456                         $following = $this->stream->char();
 457                         $this->stream->unget();
 458                         if (
 459                             !$this->token ||
 460                             $this->token['name'] !== $name ||
 461                             $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
 462                         ) {
 463                             /* if no start tag token has ever been emitted by this instance
 464                             of the tokenizer (fragment case), or, if the next few
 465                             characters do not match the tag name of the last start tag
 466                             token emitted (compared in an ASCII case-insensitive manner),
 467                             or if they do but they are not immediately followed by one of
 468                             the following characters:
 469
 470                                 * U+0009 CHARACTER TABULATION
 471                                 * U+000A LINE FEED (LF)
 472                                 * U+000C FORM FEED (FF)
 473                                 * U+0020 SPACE
 474                                 * U+003E GREATER-THAN SIGN (>)
 475                                 * U+002F SOLIDUS (/)
 476                                 * EOF
 477
 478                             ...then emit a U+003C LESS-THAN SIGN character token, a
 479                             U+002F SOLIDUS character token, and switch to the data
 480                             state to process the next input character. */
 481                             // XXX: Probably ought to replace in_array with $following === x ||...
 482
 483                             // We also need to emit $name now we've consumed that, as we
 484                             // know it'll just be emitted as a character token.
 485                             $this->emitToken(array(
 486                                 'type' => self::CHARACTER,
 487                                 'data' => '</' . $name
 488                             ));
 489
 490                             $state = 'data';
 491                         } else {
 492                             // This matches what would happen if we actually did the
 493                             // otherwise below (but we can't because we've consumed too
 494                             // much).
 495
 496                             // Start the end tag token with the name we already have.
 497                             $this->token = array(
 498                                 'name'  => $name,
 499                                 'type'  => self::ENDTAG
 500                             );
 501
 502                             // Change to tag name state.
 503                             $state = 'tag name';
 504                         }
 505                     } elseif ($this->content_model === self::PCDATA) {
 506                         /* Otherwise, if the content model flag is set to the PCDATA
 507                         state [...]: */
 508                         $char = $this->stream->char();
 509
 510                         if ('A' <= $char && $char <= 'Z') {
 511                             /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 512                             Create a new end tag token, set its tag name to the lowercase version
 513                             of the input character (add 0x0020 to the character's code point), then
 514                             switch to the tag name state. (Don't emit the token yet; further details
 515                             will be filled in before it is emitted.) */
 516                             $this->token = array(
 517                                 'name'  => strtolower($char),
 518                                 'type'  => self::ENDTAG
 519                             );
 520
 521                             $state = 'tag name';
 522
 523                         } elseif ('a' <= $char && $char <= 'z') {
 524                             /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
 525                             Create a new end tag token, set its tag name to the
 526                             input character, then switch to the tag name state.
 527                             (Don't emit the token yet; further details will be
 528                             filled in before it is emitted.) */
 529                             $this->token = array(
 530                                 'name'  => $char,
 531                                 'type'  => self::ENDTAG
 532                             );
 533
 534                             $state = 'tag name';
 535
 536                         } elseif ($char === '>') {
 537                             /* U+003E GREATER-THAN SIGN (>)
 538                             Parse error. Switch to the data state. */
 539                             $this->emitToken(array(
 540                                 'type' => self::PARSEERROR,
 541                                 'data' => 'expected-closing-tag-but-got-right-bracket'
 542                             ));
 543                             $state = 'data';
 544
 545                         } elseif ($char === false) {
 546                             /* EOF
 547                             Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
 548                             SOLIDUS character token. Reconsume the EOF character in the data state. */
 549                             $this->emitToken(array(
 550                                 'type' => self::PARSEERROR,
 551                                 'data' => 'expected-closing-tag-but-got-eof'
 552                             ));
 553                             $this->emitToken(array(
 554                                 'type' => self::CHARACTER,
 555                                 'data' => '</'
 556                             ));
 557
 558                             $this->stream->unget();
 559                             $state = 'data';
 560
 561                         } else {
 562                             /* Parse error. Switch to the bogus comment state. */
 563                             $this->emitToken(array(
 564                                 'type' => self::PARSEERROR,
 565                                 'data' => 'expected-closing-tag-but-got-char'
 566                             ));
 567                             $this->token = array(
 568                                 'data' => $char,
 569                                 'type' => self::COMMENT
 570                             );
 571                             $state = 'bogus comment';
 572                         }
 573                     }
 574                 break;
 575
 576                 case 'tag name':
 577                     /* Consume the next input character: */
 578                     $char = $this->stream->char();
 579
 580                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 581                         /* U+0009 CHARACTER TABULATION
 582                         U+000A LINE FEED (LF)
 583                         U+000C FORM FEED (FF)
 584                         U+0020 SPACE
 585                         Switch to the before attribute name state. */
 586                         $state = 'before attribute name';
 587
 588                     } elseif ($char === '/') {
 589                         /* U+002F SOLIDUS (/)
 590                         Switch to the self-closing start tag state. */
 591                         $state = 'self-closing start tag';
 592
 593                     } elseif ($char === '>') {
 594                         /* U+003E GREATER-THAN SIGN (>)
 595                         Emit the current tag token. Switch to the data state. */
 596                         $this->emitToken($this->token);
 597                         $state = 'data';
 598
 599                     } elseif ('A' <= $char && $char <= 'Z') {
 600                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 601                         Append the lowercase version of the current input
 602                         character (add 0x0020 to the character's code point) to
 603                         the current tag token's tag name. Stay in the tag name state. */
 604                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
 605
 606                         $this->token['name'] .= strtolower($char . $chars);
 607                         $state = 'tag name';
 608
 609                     } elseif ($char === false) {
 610                         /* EOF
 611                         Parse error. Reconsume the EOF character in the data state. */
 612                         $this->emitToken(array(
 613                             'type' => self::PARSEERROR,
 614                             'data' => 'eof-in-tag-name'
 615                         ));
 616
 617                         $this->stream->unget();
 618                         $state = 'data';
 619
 620                     } else {
 621                         /* Anything else
 622                         Append the current input character to the current tag token's tag name.
 623                         Stay in the tag name state. */
 624                         $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
 625
 626                         $this->token['name'] .= $char . $chars;
 627                         $state = 'tag name';
 628                     }
 629                 break;
 630
 631                 case 'before attribute name':
 632                     /* Consume the next input character: */
 633                     $char = $this->stream->char();
 634
 635                     // this conditional is optimized, check bottom
 636                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 637                         /* U+0009 CHARACTER TABULATION
 638                         U+000A LINE FEED (LF)
 639                         U+000C FORM FEED (FF)
 640                         U+0020 SPACE
 641                         Stay in the before attribute name state. */
 642                         $state = 'before attribute name';
 643
 644                     } elseif ($char === '/') {
 645                         /* U+002F SOLIDUS (/)
 646                         Switch to the self-closing start tag state. */
 647                         $state = 'self-closing start tag';
 648
 649                     } elseif ($char === '>') {
 650                         /* U+003E GREATER-THAN SIGN (>)
 651                         Emit the current tag token. Switch to the data state. */
 652                         $this->emitToken($this->token);
 653                         $state = 'data';
 654
 655                     } elseif ('A' <= $char && $char <= 'Z') {
 656                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 657                         Start a new attribute in the current tag token. Set that
 658                         attribute's name to the lowercase version of the current
 659                         input character (add 0x0020 to the character's code
 660                         point), and its value to the empty string. Switch to the
 661                         attribute name state. */
 662                         $this->token['attr'][] = array(
 663                             'name'  => strtolower($char),
 664                             'value' => ''
 665                         );
 666
 667                         $state = 'attribute name';
 668
 669                     } elseif ($char === false) {
 670                         /* EOF
 671                         Parse error. Reconsume the EOF character in the data state. */
 672                         $this->emitToken(array(
 673                             'type' => self::PARSEERROR,
 674                             'data' => 'expected-attribute-name-but-got-eof'
 675                         ));
 676
 677                         $this->stream->unget();
 678                         $state = 'data';
 679
 680                     } else {
 681                         /* U+0022 QUOTATION MARK (")
 682                            U+0027 APOSTROPHE (')
 683                            U+003C LESS-THAN SIGN (<)
 684                            U+003D EQUALS SIGN (=)
 685                         Parse error. Treat it as per the "anything else" entry
 686                         below. */
 687                         if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
 688                             $this->emitToken(array(
 689                                 'type' => self::PARSEERROR,
 690                                 'data' => 'invalid-character-in-attribute-name'
 691                             ));
 692                         }
 693
 694                         /* Anything else
 695                         Start a new attribute in the current tag token. Set that attribute's
 696                         name to the current input character, and its value to the empty string.
 697                         Switch to the attribute name state. */
 698                         $this->token['attr'][] = array(
 699                             'name'  => $char,
 700                             'value' => ''
 701                         );
 702
 703                         $state = 'attribute name';
 704                     }
 705                 break;
 706
 707                 case 'attribute name':
 708                     // Consume the next input character:
 709                     $char = $this->stream->char();
 710
 711                     // this conditional is optimized, check bottom
 712                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 713                         /* U+0009 CHARACTER TABULATION
 714                         U+000A LINE FEED (LF)
 715                         U+000C FORM FEED (FF)
 716                         U+0020 SPACE
 717                         Switch to the after attribute name state. */
 718                         $state = 'after attribute name';
 719
 720                     } elseif ($char === '/') {
 721                         /* U+002F SOLIDUS (/)
 722                         Switch to the self-closing start tag state. */
 723                         $state = 'self-closing start tag';
 724
 725                     } elseif ($char === '=') {
 726                         /* U+003D EQUALS SIGN (=)
 727                         Switch to the before attribute value state. */
 728                         $state = 'before attribute value';
 729
 730                     } elseif ($char === '>') {
 731                         /* U+003E GREATER-THAN SIGN (>)
 732                         Emit the current tag token. Switch to the data state. */
 733                         $this->emitToken($this->token);
 734                         $state = 'data';
 735
 736                     } elseif ('A' <= $char && $char <= 'Z') {
 737                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 738                         Append the lowercase version of the current input
 739                         character (add 0x0020 to the character's code point) to
 740                         the current attribute's name. Stay in the attribute name
 741                         state. */
 742                         $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
 743
 744                         $last = count($this->token['attr']) - 1;
 745                         $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
 746
 747                         $state = 'attribute name';
 748
 749                     } elseif ($char === false) {
 750                         /* EOF
 751                         Parse error. Reconsume the EOF character in the data state. */
 752                         $this->emitToken(array(
 753                             'type' => self::PARSEERROR,
 754                             'data' => 'eof-in-attribute-name'
 755                         ));
 756
 757                         $this->stream->unget();
 758                         $state = 'data';
 759
 760                     } else {
 761                         /* U+0022 QUOTATION MARK (")
 762                            U+0027 APOSTROPHE (')
 763                            U+003C LESS-THAN SIGN (<)
 764                         Parse error. Treat it as per the "anything else"
 765                         entry below. */
 766                         if ($char === '"' || $char === "'" || $char === '<') {
 767                             $this->emitToken(array(
 768                                 'type' => self::PARSEERROR,
 769                                 'data' => 'invalid-character-in-attribute-name'
 770                             ));
 771                         }
 772
 773                         /* Anything else
 774                         Append the current input character to the current attribute's name.
 775                         Stay in the attribute name state. */
 776                         $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
 777
 778                         $last = count($this->token['attr']) - 1;
 779                         $this->token['attr'][$last]['name'] .= $char . $chars;
 780
 781                         $state = 'attribute name';
 782                     }
 783
 784                     /* When the user agent leaves the attribute name state
 785                     (and before emitting the tag token, if appropriate), the
 786                     complete attribute's name must be compared to the other
 787                     attributes on the same token; if there is already an
 788                     attribute on the token with the exact same name, then this
 789                     is a parse error and the new attribute must be dropped, along
 790                     with the value that gets associated with it (if any). */
 791                     // this might be implemented in the emitToken method
 792                 break;
 793
 794                 case 'after attribute name':
 795                     // Consume the next input character:
 796                     $char = $this->stream->char();
 797
 798                     // this is an optimized conditional, check the bottom
 799                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 800                         /* U+0009 CHARACTER TABULATION
 801                         U+000A LINE FEED (LF)
 802                         U+000C FORM FEED (FF)
 803                         U+0020 SPACE
 804                         Stay in the after attribute name state. */
 805                         $state = 'after attribute name';
 806
 807                     } elseif ($char === '/') {
 808                         /* U+002F SOLIDUS (/)
 809                         Switch to the self-closing start tag state. */
 810                         $state = 'self-closing start tag';
 811
 812                     } elseif ($char === '=') {
 813                         /* U+003D EQUALS SIGN (=)
 814                         Switch to the before attribute value state. */
 815                         $state = 'before attribute value';
 816
 817                     } elseif ($char === '>') {
 818                         /* U+003E GREATER-THAN SIGN (>)
 819                         Emit the current tag token. Switch to the data state. */
 820                         $this->emitToken($this->token);
 821                         $state = 'data';
 822
 823                     } elseif ('A' <= $char && $char <= 'Z') {
 824                         /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
 825                         Start a new attribute in the current tag token. Set that
 826                         attribute's name to the lowercase version of the current
 827                         input character (add 0x0020 to the character's code
 828                         point), and its value to the empty string. Switch to the
 829                         attribute name state. */
 830                         $this->token['attr'][] = array(
 831                             'name'  => strtolower($char),
 832                             'value' => ''
 833                         );
 834
 835                         $state = 'attribute name';
 836
 837                     } elseif ($char === false) {
 838                         /* EOF
 839                         Parse error. Reconsume the EOF character in the data state. */
 840                         $this->emitToken(array(
 841                             'type' => self::PARSEERROR,
 842                             'data' => 'expected-end-of-tag-but-got-eof'
 843                         ));
 844
 845                         $this->stream->unget();
 846                         $state = 'data';
 847
 848                     } else {
 849                         /* U+0022 QUOTATION MARK (")
 850                            U+0027 APOSTROPHE (')
 851                            U+003C LESS-THAN SIGN (<)
 852                         Parse error. Treat it as per the "anything else"
 853                         entry below. */
 854                         if ($char === '"' || $char === "'" || $char === "<") {
 855                             $this->emitToken(array(
 856                                 'type' => self::PARSEERROR,
 857                                 'data' => 'invalid-character-after-attribute-name'
 858                             ));
 859                         }
 860
 861                         /* Anything else
 862                         Start a new attribute in the current tag token. Set that attribute's
 863                         name to the current input character, and its value to the empty string.
 864                         Switch to the attribute name state. */
 865                         $this->token['attr'][] = array(
 866                             'name'  => $char,
 867                             'value' => ''
 868                         );
 869
 870                         $state = 'attribute name';
 871                     }
 872                 break;
 873
 874                 case 'before attribute value':
 875                     // Consume the next input character:
 876                     $char = $this->stream->char();
 877
 878                     // this is an optimized conditional
 879                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
 880                         /* U+0009 CHARACTER TABULATION
 881                         U+000A LINE FEED (LF)
 882                         U+000C FORM FEED (FF)
 883                         U+0020 SPACE
 884                         Stay in the before attribute value state. */
 885                         $state = 'before attribute value';
 886
 887                     } elseif ($char === '"') {
 888                         /* U+0022 QUOTATION MARK (")
 889                         Switch to the attribute value (double-quoted) state. */
 890                         $state = 'attribute value (double-quoted)';
 891
 892                     } elseif ($char === '&') {
 893                         /* U+0026 AMPERSAND (&)
 894                         Switch to the attribute value (unquoted) state and reconsume
 895                         this input character. */
 896                         $this->stream->unget();
 897                         $state = 'attribute value (unquoted)';
 898
 899                     } elseif ($char === '\'') {
 900                         /* U+0027 APOSTROPHE (')
 901                         Switch to the attribute value (single-quoted) state. */
 902                         $state = 'attribute value (single-quoted)';
 903
 904                     } elseif ($char === '>') {
 905                         /* U+003E GREATER-THAN SIGN (>)
 906                         Parse error. Emit the current tag token. Switch to the data state. */
 907                         $this->emitToken(array(
 908                             'type' => self::PARSEERROR,
 909                             'data' => 'expected-attribute-value-but-got-right-bracket'
 910                         ));
 911                         $this->emitToken($this->token);
 912                         $state = 'data';
 913
 914                     } elseif ($char === false) {
 915                         /* EOF
 916                         Parse error. Reconsume the EOF character in the data state. */
 917                         $this->emitToken(array(
 918                             'type' => self::PARSEERROR,
 919                             'data' => 'expected-attribute-value-but-got-eof'
 920                         ));
 921                         $this->stream->unget();
 922                         $state = 'data';
 923
 924                     } else {
 925                         /* U+003D EQUALS SIGN (=)
 926                            U+003C LESS-THAN SIGN (<)
 927                         Parse error. Treat it as per the "anything else" entry below. */
 928                         if ($char === '=' || $char === '<') {
 929                             $this->emitToken(array(
 930                                 'type' => self::PARSEERROR,
 931                                 'data' => 'equals-in-unquoted-attribute-value'
 932                             ));
 933                         }
 934
 935                         /* Anything else
 936                         Append the current input character to the current attribute's value.
 937                         Switch to the attribute value (unquoted) state. */
 938                         $last = count($this->token['attr']) - 1;
 939                         $this->token['attr'][$last]['value'] .= $char;
 940
 941                         $state = 'attribute value (unquoted)';
 942                     }
 943                 break;
 944
 945                 case 'attribute value (double-quoted)':
 946                     // Consume the next input character:
 947                     $char = $this->stream->char();
 948
 949                     if ($char === '"') {
 950                         /* U+0022 QUOTATION MARK (")
 951                         Switch to the after attribute value (quoted) state. */
 952                         $state = 'after attribute value (quoted)';
 953
 954                     } elseif ($char === '&') {
 955                         /* U+0026 AMPERSAND (&)
 956                         Switch to the character reference in attribute value
 957                         state, with the additional allowed character
 958                         being U+0022 QUOTATION MARK ("). */
 959                         $this->characterReferenceInAttributeValue('"');
 960
 961                     } elseif ($char === false) {
 962                         /* EOF
 963                         Parse error. Reconsume the EOF character in the data state. */
 964                         $this->emitToken(array(
 965                             'type' => self::PARSEERROR,
 966                             'data' => 'eof-in-attribute-value-double-quote'
 967                         ));
 968
 969                         $this->stream->unget();
 970                         $state = 'data';
 971
 972                     } else {
 973                         /* Anything else
 974                         Append the current input character to the current attribute's value.
 975                         Stay in the attribute value (double-quoted) state. */
 976                         $chars = $this->stream->charsUntil('"&');
 977
 978                         $last = count($this->token['attr']) - 1;
 979                         $this->token['attr'][$last]['value'] .= $char . $chars;
 980
 981                         $state = 'attribute value (double-quoted)';
 982                     }
 983                 break;
 984
 985                 case 'attribute value (single-quoted)':
 986                     // Consume the next input character:
 987                     $char = $this->stream->char();
 988
 989                     if ($char === "'") {
 990                         /* U+0027 APOSTROPHE (')
 991                         Switch to the after attribute value (quoted) state. */
 992                         $state = 'after attribute value (quoted)';
 993
 994                     } elseif ($char === '&') {
 995                         /* U+0026 AMPERSAND (&)
 996                         Switch to the entity in attribute value state. */
 997                         $this->characterReferenceInAttributeValue("'");
 998
 999                     } elseif ($char === false) {
1000                         /* EOF
1001                         Parse error. Reconsume the EOF character in the data state. */
1002                         $this->emitToken(array(
1003                             'type' => self::PARSEERROR,
1004                             'data' => 'eof-in-attribute-value-single-quote'
1005                         ));
1006
1007                         $this->stream->unget();
1008                         $state = 'data';
1009
1010                     } else {
1011                         /* Anything else
1012                         Append the current input character to the current attribute's value.
1013                         Stay in the attribute value (single-quoted) state. */
1014                         $chars = $this->stream->charsUntil("'&");
1015
1016                         $last = count($this->token['attr']) - 1;
1017                         $this->token['attr'][$last]['value'] .= $char . $chars;
1018
1019                         $state = 'attribute value (single-quoted)';
1020                     }
1021                 break;
1022
1023                 case 'attribute value (unquoted)':
1024                     // Consume the next input character:
1025                     $char = $this->stream->char();
1026
1027                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1028                         /* U+0009 CHARACTER TABULATION
1029                         U+000A LINE FEED (LF)
1030                         U+000C FORM FEED (FF)
1031                         U+0020 SPACE
1032                         Switch to the before attribute name state. */
1033                         $state = 'before attribute name';
1034
1035                     } elseif ($char === '&') {
1036                         /* U+0026 AMPERSAND (&)
1037                         Switch to the entity in attribute value state, with the
1038                         additional allowed character being U+003E
1039                         GREATER-THAN SIGN (>). */
1040                         $this->characterReferenceInAttributeValue('>');
1041
1042                     } elseif ($char === '>') {
1043                         /* U+003E GREATER-THAN SIGN (>)
1044                         Emit the current tag token. Switch to the data state. */
1045                         $this->emitToken($this->token);
1046                         $state = 'data';
1047
1048                     } elseif ($char === false) {
1049                         /* EOF
1050                         Parse error. Reconsume the EOF character in the data state. */
1051                         $this->emitToken(array(
1052                             'type' => self::PARSEERROR,
1053                             'data' => 'eof-in-attribute-value-no-quotes'
1054                         ));
1055                         $this->stream->unget();
1056                         $state = 'data';
1057
1058                     } else {
1059                         /* U+0022 QUOTATION MARK (")
1060                            U+0027 APOSTROPHE (')
1061                            U+003C LESS-THAN SIGN (<)
1062                            U+003D EQUALS SIGN (=)
1063                         Parse error. Treat it as per the "anything else"
1064                         entry below. */
1065                         if ($char === '"' || $char === "'" || $char === '=' || $char == '<') {
1066                             $this->emitToken(array(
1067                                 'type' => self::PARSEERROR,
1068                                 'data' => 'unexpected-character-in-unquoted-attribute-value'
1069                             ));
1070                         }
1071
1072                         /* Anything else
1073                         Append the current input character to the current attribute's value.
1074                         Stay in the attribute value (unquoted) state. */
1075                         $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1076
1077                         $last = count($this->token['attr']) - 1;
1078                         $this->token['attr'][$last]['value'] .= $char . $chars;
1079
1080                         $state = 'attribute value (unquoted)';
1081                     }
1082                 break;
1083
1084                 case 'after attribute value (quoted)':
1085                     /* Consume the next input character: */
1086                     $char = $this->stream->char();
1087
1088                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1089                         /* U+0009 CHARACTER TABULATION
1090                            U+000A LINE FEED (LF)
1091                            U+000C FORM FEED (FF)
1092                            U+0020 SPACE
1093                         Switch to the before attribute name state. */
1094                         $state = 'before attribute name';
1095
1096                     } elseif ($char === '/') {
1097                         /* U+002F SOLIDUS (/)
1098                         Switch to the self-closing start tag state. */
1099                         $state = 'self-closing start tag';
1100
1101                     } elseif ($char === '>') {
1102                         /* U+003E GREATER-THAN SIGN (>)
1103                         Emit the current tag token. Switch to the data state. */
1104                         $this->emitToken($this->token);
1105                         $state = 'data';
1106
1107                     } elseif ($char === false) {
1108                         /* EOF
1109                         Parse error. Reconsume the EOF character in the data state. */
1110                         $this->emitToken(array(
1111                             'type' => self::PARSEERROR,
1112                             'data' => 'unexpected-EOF-after-attribute-value'
1113                         ));
1114                         $this->stream->unget();
1115                         $state = 'data';
1116
1117                     } else {
1118                         /* Anything else
1119                         Parse error. Reconsume the character in the before attribute
1120                         name state. */
1121                         $this->emitToken(array(
1122                             'type' => self::PARSEERROR,
1123                             'data' => 'unexpected-character-after-attribute-value'
1124                         ));
1125                         $this->stream->unget();
1126                         $state = 'before attribute name';
1127                     }
1128                 break;
1129
1130                 case 'self-closing start tag':
1131                     /* Consume the next input character: */
1132                     $char = $this->stream->char();
1133
1134                     if ($char === '>') {
1135                         /* U+003E GREATER-THAN SIGN (>)
1136                         Set the self-closing flag of the current tag token.
1137                         Emit the current tag token. Switch to the data state. */
1138                         // not sure if this is the name we want
1139                         $this->token['self-closing'] = true;
1140                         $this->emitToken($this->token);
1141                         $state = 'data';
1142
1143                     } elseif ($char === false) {
1144                         /* EOF
1145                         Parse error. Reconsume the EOF character in the data state. */
1146                         $this->emitToken(array(
1147                             'type' => self::PARSEERROR,
1148                             'data' => 'unexpected-eof-after-self-closing'
1149                         ));
1150                         $this->stream->unget();
1151                         $state = 'data';
1152
1153                     } else {
1154                         /* Anything else
1155                         Parse error. Reconsume the character in the before attribute name state. */
1156                         $this->emitToken(array(
1157                             'type' => self::PARSEERROR,
1158                             'data' => 'unexpected-character-after-self-closing'
1159                         ));
1160                         $this->stream->unget();
1161                         $state = 'before attribute name';
1162                     }
1163                 break;
1164
1165                 case 'bogus comment':
1166                     /* (This can only happen if the content model flag is set to the PCDATA state.) */
1167                     /* Consume every character up to the first U+003E GREATER-THAN SIGN
1168                     character (>) or the end of the file (EOF), whichever comes first. Emit
1169                     a comment token whose data is the concatenation of all the characters
1170                     starting from and including the character that caused the state machine
1171                     to switch into the bogus comment state, up to and including the last
1172                     consumed character before the U+003E character, if any, or up to the
1173                     end of the file otherwise. (If the comment was started by the end of
1174                     the file (EOF), the token is empty.) */
1175                     $this->token['data'] .= (string) $this->stream->charsUntil('>');
1176                     $this->stream->char();
1177
1178                     $this->emitToken($this->token);
1179
1180                     /* Switch to the data state. */
1181                     $state = 'data';
1182                 break;
1183
1184                 case 'markup declaration open':
1185                     // Consume for below
1186                     $hyphens = $this->stream->charsWhile('-', 2);
1187                     if ($hyphens === '-') {
1188                         $this->stream->unget();
1189                     }
1190                     if ($hyphens !== '--') {
1191                         $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1192                     }
1193
1194                     /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1195                     characters, consume those two characters, create a comment token whose
1196                     data is the empty string, and switch to the comment state. */
1197                     if ($hyphens === '--') {
1198                         $state = 'comment start';
1199                         $this->token = array(
1200                             'data' => '',
1201                             'type' => self::COMMENT
1202                         );
1203
1204                     /* Otherwise if the next seven characters are a case-insensitive match
1205                     for the word "DOCTYPE", then consume those characters and switch to the
1206                     DOCTYPE state. */
1207                     } elseif (strtoupper($alpha) === 'DOCTYPE') {
1208                         $state = 'DOCTYPE';
1209
1210                     // XXX not implemented
1211                     /* Otherwise, if the insertion mode is "in foreign content"
1212                     and the current node is not an element in the HTML namespace
1213                     and the next seven characters are an ASCII case-sensitive
1214                     match for the string "[CDATA[" (the five uppercase letters
1215                     "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1216                     and after), then consume those characters and switch to the
1217                     CDATA section state (which is unrelated to the content model
1218                     flag's CDATA state). */
1219
1220                     /* Otherwise, this is a parse error. Switch to the bogus comment state.
1221                     The next character that is consumed, if any, is the first character
1222                     that will be in the comment. */
1223                     } else {
1224                         $this->emitToken(array(
1225                             'type' => self::PARSEERROR,
1226                             'data' => 'expected-dashes-or-doctype'
1227                         ));
1228                         $this->token = array(
1229                             'data' => (string) $alpha,
1230                             'type' => self::COMMENT
1231                         );
1232                         $state = 'bogus comment';
1233                     }
1234                 break;
1235
1236                 case 'comment start':
1237                     // Comment start state.
1238                     //
1239                     // Reads one character and picks the follow-up state:
1240                     //   "-"   -> 'comment start dash'; nothing is appended yet
1241                     //   ">"   -> parse error, emit the comment token, 'data'
1242                     //   EOF   -> parse error, emit the comment token, unget(), 'data'
1243                     //   other -> append the character to the comment, 'comment'
1244                     $char = $this->stream->char();
1245
1246                     if ($char === '>' || $char === false) {
1247                         // ">" and EOF share one code path: the comment ends here with
1248                         // whatever data it has. Only the reported error code differs,
1249                         // and for EOF unget() is called on the stream so that the data
1250                         // state can reconsume it.
1251                         $this->emitToken(array(
1252                             'type' => self::PARSEERROR,
1253                             'data' => ($char === false) ? 'eof-in-comment' : 'incorrect-comment'
1254                         ));
1255                         $this->emitToken($this->token);
1256                         if ($char === false) {
1257                             $this->stream->unget();
1258                         }
1259                         $state = 'data';
1260
1261                     } elseif ($char === '-') {
1262                         // The dash is not stored by this state; what happens to it
1263                         // is decided in the 'comment start dash' state.
1264                         $state = 'comment start dash';
1265
1266                     } else {
1267                         // Plain comment text: keep it and carry on in the regular
1268                         // comment state.
1269                         $this->token['data'] .= $char;
1270                         $state = 'comment';
1271                     }
1272                 break;
1273
1274                 case 'comment start dash':
1275                     // Comment start dash state.
1276                     //
1277                     // A single "-" was read at the very start of the comment and has
1278                     // not been added to the comment token's data yet.
1279                     $char = $this->stream->char();
1280
1281                     if ($char === '>' || $char === false) {
1282                         // Premature end of the comment, either by ">" or by EOF: parse
1283                         // error, emit the comment token, return to the data state. For
1284                         // EOF unget() is called so the data state can reconsume it.
1285                         $this->emitToken(array(
1286                             'type' => self::PARSEERROR,
1287                             'data' => ($char === false) ? 'eof-in-comment' : 'incorrect-comment'
1288                         ));
1289                         $this->emitToken($this->token);
1290                         if ($char === false) {
1291                             $this->stream->unget();
1292                         }
1293                         $state = 'data';
1294
1295                     } elseif ($char === '-') {
1296                         // Second dash in a row: hand over to the comment end state.
1297                         $state = 'comment end';
1298
1299                     } else {
1300                         // The pending dash turned out to be text, so it is stored
1301                         // together with the character that followed it.
1302                         $this->token['data'] .= '-' . $char;
1303                         $state = 'comment';
1304                     }
1305                 break;
1306
1307                 case 'comment':
1308                     // Comment state: collects the body of the comment until a "-"
1309                     // or EOF shows up.
1310                     $char = $this->stream->char();
1311
1312                     if ($char === false) {
1313                         // EOF inside the comment: parse error, emit what has been
1314                         // collected, call unget() so the data state reconsumes the
1315                         // EOF, and leave the comment.
1316                         $this->emitToken(array(
1317                             'type' => self::PARSEERROR,
1318                             'data' => 'eof-in-comment'
1319                         ));
1320                         $this->emitToken($this->token);
1321                         $this->stream->unget();
1322                         $state = 'data';
1323
1324                     } elseif ($char !== '-') {
1325                         // Ordinary text. The state is left unchanged; the current
1326                         // character plus everything charsUntil('-') returns is
1327                         // appended to the comment in one go.
1328                         $chars = $this->stream->charsUntil('-');
1329                         $this->token['data'] .= $char . $chars;
1330
1331                     } else {
1332                         // A dash: it is not stored until the next state has looked
1333                         // at what follows.
1334                         $state = 'comment end dash';
1335                     }
1336                 break;
1337
1338                 case 'comment end dash':
1339                     // Comment end dash state.
1340                     //
1341                     // One "-" has been read inside the comment body and is still
1342                     // pending, i.e. it has not been added to the comment's data.
1343                     $char = $this->stream->char();
1344
1345                     if ($char === false) {
1346                         // EOF right after the dash: parse error, emit the comment
1347                         // token, call unget() so the data state reconsumes the EOF.
1348                         $this->emitToken(array(
1349                             'type' => self::PARSEERROR,
1350                             'data' => 'eof-in-comment-end-dash'
1351                         ));
1352                         $this->emitToken($this->token);
1353                         $this->stream->unget();
1354                         $state = 'data';
1355
1356                     } elseif ($char === '-') {
1357                         // Two dashes in a row: move on to the comment end state.
1358                         $state = 'comment end';
1359
1360                     } else {
1361                         // The dash was plain text after all: store it followed by
1362                         // the current character and return to the comment state.
1363                         $this->token['data'] .= "-{$char}";
1364                         $state = 'comment';
1365                     }
1366                 break;
1367
1368                 case 'comment end':
1369                     // Comment end state: "--" has been read and is still pending.
1370                     $char = $this->stream->char();
1371
1372                     if ($char === '>') {
1373                         /* U+003E GREATER-THAN SIGN (>)
1374                         Emit the comment token. Switch to the data state. */
1375                         $this->emitToken($this->token);
1376                         $state = 'data';
1377                     } elseif ($char === '-') {
1378                         /* U+002D HYPHEN-MINUS (-)
1379                         Parse error. Append a U+002D HYPHEN-MINUS (-) character
1380                         to the comment token's data. Stay in the comment end
1381                         state. */
1382                         $this->emitToken(array(
1383                             'type' => self::PARSEERROR,
1384                             'data' => 'unexpected-dash-after-double-dash-in-comment'
1385                         ));
1386                         $this->token['data'] .= '-';
1387                     } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1388                         // Tab, line feed, form feed ("\x0c") or space: parse error.
1389                         // The pending "--" and the whitespace character become
1390                         // comment data and 'comment end space' takes over.
1391                         $this->emitToken(array(
1392                             'type' => self::PARSEERROR,
1393                             'data' => 'unexpected-space-after-double-dash-in-comment'
1394                         ));
1395                         $this->token['data'] .= '--' . $char;
1396                         $state = 'comment end space';
1397                     } elseif ($char === '!') {
1398                         // "!" after "--": parse error. Nothing is stored here; the
1399                         // 'comment end bang' state adds "--!" if the comment goes on.
1400                         $this->emitToken(array(
1401                             'type' => self::PARSEERROR,
1402                             'data' => 'unexpected-bang-after-double-dash-in-comment'
1403                         ));
1404                         $state = 'comment end bang';
1405                     } elseif ($char === false) {
1406                         /* EOF
1407                         Parse error. Emit the comment token. Reconsume the
1408                         EOF character in the data state. */
1409                         $this->emitToken(array(
1410                             'type' => self::PARSEERROR,
1411                             'data' => 'eof-in-comment-double-dash'
1412                         ));
1413                         $this->emitToken($this->token);
1414                         $this->stream->unget();
1415                         $state = 'data';
1416                     } else {
1417                         /* Anything else
1418                         Parse error. Append two U+002D HYPHEN-MINUS (-)
1419                         characters and the input character to the comment token's
1420                         data. Switch to the comment state. */
1421                         $this->emitToken(array(
1422                             'type' => self::PARSEERROR,
1423                             'data' => 'unexpected-char-in-comment'
1424                         ));
1425                         $this->token['data'] .= '--'.$char;
1426                         $state = 'comment';
1427                     }
1428                 break;
1429
1430                 case 'comment end bang':
1431                     // Comment end bang state: "--!" has been read but is not yet
1432                     // part of the comment's data.
1433                     $char = $this->stream->char();
1434                     if ($char === false) {
1435                         // EOF: parse error, emit the comment, unget() for 'data'.
1436                         $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'eof-in-comment-end-bang'));
1437                         $this->emitToken($this->token);
1438                         $this->stream->unget();
1439                         $state = 'data';
1440                     } elseif ($char === '>') {
1441                         // "--!>" closes the comment; the "--!" is dropped.
1442                         $this->emitToken($this->token);
1443                         $state = 'data';
1444                     } else {
1445                         // The comment continues, so "--!" becomes data; a dash stays
1446                         // pending for 'comment end dash', anything else is stored too.
1447                         $this->token['data'] .= ($char === '-') ? '--!' : ('--!' . $char);
1448                         $state = ($char === '-') ? 'comment end dash' : 'comment';
1449                     }
1450                 break;
1451
1452                 case 'comment end space':
1453                     // Comment end space state: reached after "--" plus whitespace.
1454                     $char = $this->stream->char();
1455                     if ($char === false) {
1456                         $this->emitToken(array('type' => self::PARSEERROR, 'data' => 'unexpected-eof-in-comment-end-space'));
1457                         $this->emitToken($this->token);
1458                         $this->stream->unget();
1459                         $state = 'data';
1460                     } elseif ($char === '>') {
1461                         $this->emitToken($this->token);
1462                         $state = 'data';
1463                     } elseif ($char === '-') {
1464                         $state = 'comment end dash';
1465                     } else {
1466                         // Every other character is stored. Tab, LF, FF and space
1467                         // keep this state; anything else resumes 'comment'.
1468                         $this->token['data'] .= $char;
1469                         if ($char !== "\t" && $char !== "\n" && $char !== "\x0c" && $char !== ' ') {
1470                             $state = 'comment';
1471                         }
1472                     }
1473                 break;
1474
1475                 case 'DOCTYPE':
1476                     // DOCTYPE state.
1477                     //
1478                     // Whitespace is expected at this point. When it is missing a
1479                     // parse error is reported, but tokenizing continues in the
1480                     // 'before DOCTYPE name' state either way, unless the input has
1481                     // ended.
1482                     $char = $this->stream->char();
1483
1484                     if ($char === false) {
1485                         // EOF: parse error, then a nameless DOCTYPE token with
1486                         // force-quirks switched on is emitted and unget() is called
1487                         // so that the data state reconsumes the EOF.
1488                         $this->emitToken(array(
1489                             'type' => self::PARSEERROR,
1490                             'data' => 'need-space-after-doctype-but-got-eof'
1491                         ));
1492                         $this->emitToken(array(
1493                             'name' => '',
1494                             'type' => self::DOCTYPE,
1495                             'force-quirks' => true,
1496                             'error' => true
1497                         ));
1498                         $this->stream->unget();
1499                         $state = 'data';
1500
1501                     } else {
1502                         if ($char !== "\t" && $char !== "\n" && $char !== "\x0c" && $char !== ' ') {
1503                             // Not whitespace: parse error, and unget() hands the
1504                             // character back so the next state reads it again.
1505                             $this->emitToken(array(
1506                                 'type' => self::PARSEERROR,
1507                                 'data' => 'need-space-after-doctype'
1508                             ));
1509                             $this->stream->unget();
1510                         }
1511
1512                         // With or without the separating whitespace, the DOCTYPE
1513                         // name is expected next.
1514                         $state = 'before DOCTYPE name';
1515                     }
1516                 break;
1517
1518                 case 'before DOCTYPE name':
1519                     // Before DOCTYPE name state.
1520                     //
1521                     // Transitions on the next character:
1522                     //   tab/LF/FF/space -> ignored, stay in this state
1523                     //   ">"             -> parse error, emit a nameless force-quirks
1524                     //                      DOCTYPE token, 'data'
1525                     //   "A".."Z"        -> new DOCTYPE token named with the lower-cased
1526                     //                      letter, 'DOCTYPE name'
1527                     //   EOF             -> parse error, emit a nameless force-quirks
1528                     //                      DOCTYPE token, unget(), 'data'
1529                     //   other           -> new DOCTYPE token named with the character,
1530                     //                      'DOCTYPE name'
1531                     //
1532                     // Every DOCTYPE token built here starts with 'error' => true.
1533                     $char = $this->stream->char();
1534
1535                     if ($char === false) {
1536                         // EOF before any name: parse error, emit the nameless
1537                         // force-quirks DOCTYPE token and call unget() so the data
1538                         // state reconsumes the EOF.
1539                         $this->emitToken(array(
1540                             'type' => self::PARSEERROR,
1541                             'data' => 'expected-doctype-name-but-got-eof'
1542                         ));
1543                         $this->emitToken(array(
1544                             'name' => '',
1545                             'type' => self::DOCTYPE,
1546                             'force-quirks' => true,
1547                             'error' => true
1548                         ));
1549
1550                         $this->stream->unget();
1551                         $state = 'data';
1552
1553                     } elseif ($char === '>') {
1554                         // ">" before any name: parse error, emit the nameless
1555                         // force-quirks DOCTYPE token and return to the data state.
1556                         $this->emitToken(array(
1557                             'type' => self::PARSEERROR,
1558                             'data' => 'expected-doctype-name-but-got-right-bracket'
1559                         ));
1560                         $this->emitToken(array(
1561                             'name' => '',
1562                             'type' => self::DOCTYPE,
1563                             'force-quirks' => true,
1564                             'error' => true
1565                         ));
1566
1567                         $state = 'data';
1568
1569                     } elseif ('A' <= $char && $char <= 'Z') {
1570                         // Upper-case ASCII letter: it opens the DOCTYPE token's
1571                         // name in its lower-case form.
1572                         $this->token = array(
1573                             'name' => strtolower($char),
1574                             'type' => self::DOCTYPE,
1575                             'error' => true
1576                         );
1577
1578                         $state = 'DOCTYPE name';
1579
1580                     } elseif ($char !== "\t" && $char !== "\n" && $char !== "\x0c" && $char !== ' ') {
1581                         // Any other non-whitespace character opens the DOCTYPE
1582                         // token's name unchanged.
1583                         $this->token = array(
1584                             'name' => $char,
1585                             'type' => self::DOCTYPE,
1586                             'error' => true
1587                         );
1588
1589                         $state = 'DOCTYPE name';
1590                     }
1591                     // Whitespace matches no branch and is simply skipped.
1592                 break;
1593
1594                 case 'DOCTYPE name':
1595                     // DOCTYPE name state.
1596                     //
1597                     // Transitions on the next character:
1598                     //   tab/LF/FF/space -> 'after DOCTYPE name'
1599                     //   ">"             -> emit the DOCTYPE token, 'data'
1600                     //   EOF             -> parse error, force-quirks on, emit the
1601                     //                      DOCTYPE token, unget(), 'data'
1602                     //   "A".."Z"        -> append the lower-cased letter to the name
1603                     //   other           -> append the character to the name
1604                     //
1605                     // In every case the token's 'error' entry is recomputed last.
1606                     $char = $this->stream->char();
1607
1608                     if ($char === false) {
1609                         // EOF inside the name: parse error, switch force-quirks on,
1610                         // emit the DOCTYPE token and call unget() so the data state
1611                         // reconsumes the EOF.
1612                         $this->emitToken(array(
1613                             'type' => self::PARSEERROR,
1614                             'data' => 'eof-in-doctype-name'
1615                         ));
1616                         $this->token['force-quirks'] = true;
1617                         $this->emitToken($this->token);
1618                         $this->stream->unget();
1619                         $state = 'data';
1620
1621                     } elseif ($char === '>') {
1622                         // End of the DOCTYPE: emit the token as it stands.
1623                         $this->emitToken($this->token);
1624                         $state = 'data';
1625
1626                     } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1627                         // Whitespace ends the name.
1628                         $state = 'after DOCTYPE name';
1629
1630                     } else {
1631                         // Part of the name; only "A".."Z" are case-folded.
1632                         $this->token['name'] .= ('A' <= $char && $char <= 'Z')
1633                             ? strtolower($char)
1634                             : $char;
1635                     }
1636
1637                     // XXX this is probably some sort of quirks mode designation,
1638                     // check tree-builder to be sure. In general 'error' needs
1639                     // to be specc'ified, this probably means removing it at the end
1640                     //
1641                     // NOTE(review): the letters "A".."Z" are lower-cased before
1642                     // being stored, so the comparison with upper-case 'HTML' looks
1643                     // like it cannot succeed for such input; confirm the intent.
1644                     // This also runs after branches that already emitted the token.
1645                     $this->token['error'] = ($this->token['name'] !== 'HTML');
1646                 break;
1647
1648                 case 'after DOCTYPE name':
1649                     // After DOCTYPE name state.
1650                     //
1651                     // Transitions on the next character:
1652                     //   tab/LF/FF/space -> ignored, stay in this state
1653                     //   ">"             -> emit the DOCTYPE token, 'data'
1654                     //   EOF             -> parse error, force-quirks on, emit the
1655                     //                      DOCTYPE token, unget(), 'data'
1656                     //   other           -> look for the keywords PUBLIC / SYSTEM
1657                     //
1658                     // Characters consumed while looking for a keyword are not handed
1659                     // back to the stream when no keyword is found.
1660                     $char = $this->stream->char();
1661
1662                     if ($char === false) {
1663                         // EOF: parse error, switch force-quirks on, emit the DOCTYPE
1664                         // token and call unget() so the data state reconsumes the EOF.
1665                         $this->emitToken(array(
1666                             'type' => self::PARSEERROR,
1667                             'data' => 'eof-in-doctype'
1668                         ));
1669                         $this->token['force-quirks'] = true;
1670                         $this->emitToken($this->token);
1671                         $this->stream->unget();
1672                         $state = 'data';
1673
1674                     } elseif ($char === '>') {
1675                         // End of the DOCTYPE: emit the token as it stands.
1676                         $this->emitToken($this->token);
1677                         $state = 'data';
1678
1679                     } elseif ($char !== "\t" && $char !== "\n" && $char !== "\x0c" && $char !== ' ') {
1680                         // The current character plus whatever charsWhile(self::ALPHA, 5)
1681                         // returns is upper-cased, so the keyword comparison below is
1682                         // case-insensitive. NOTE(review): presumably that call yields
1683                         // at most five further letters - confirm in the input stream.
1684                         $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1685
1686                         if ($nextSix !== 'PUBLIC' && $nextSix !== 'SYSTEM') {
1687                             // Neither keyword: parse error. The token is flagged with
1688                             // force-quirks and error, and the 'bogus DOCTYPE' state
1689                             // takes over.
1690                             $this->emitToken(array(
1691                                 'type' => self::PARSEERROR,
1692                                 'data' => 'expected-space-or-right-bracket-in-doctype'
1693                             ));
1694                             $this->token['force-quirks'] = true;
1695                             $this->token['error'] = true;
1696                             $state = 'bogus DOCTYPE';
1697
1698                         } elseif ($nextSix === 'PUBLIC') {
1699                             // "PUBLIC" in any letter case: a public identifier is
1700                             // expected next.
1701                             $state = 'before DOCTYPE public identifier';
1702
1703                         } else {
1704                             // "SYSTEM" in any letter case: a system identifier is
1705                             // expected next.
1706                             $state = 'before DOCTYPE system identifier';
1707                         }
1708                     }
1709                     // Whitespace matches no branch and is simply skipped.
1710                 break;
1711
1712                 case 'before DOCTYPE public identifier':
1713                     // Before DOCTYPE public identifier state.
1714                     //
1715                     // Transitions on the next character:
1716                     //   tab/LF/FF/space -> ignored, stay in this state
1717                     //   '"' or "'"      -> start an empty public identifier and read
1718                     //                      it in the matching quoted state
1719                     //   ">"             -> parse error, force-quirks on, emit the
1720                     //                      DOCTYPE token, 'data'
1721                     //   EOF             -> as for ">", plus unget()
1722                     //   other           -> parse error, force-quirks on,
1723                     //                      'bogus DOCTYPE'
1724                     $char = $this->stream->char();
1725
1726                     if ($char === '"' || $char === "'") {
1727                         // The identifier is set to the empty string (as opposed to
1728                         // missing) before its first character is read.
1729                         $this->token['public'] = '';
1730                         $state = ($char === '"')
1731                             ? 'DOCTYPE public identifier (double-quoted)'
1732                             : 'DOCTYPE public identifier (single-quoted)';
1733
1734                     } elseif ($char === '>') {
1735                         // The DOCTYPE ends before any identifier was given: parse
1736                         // error, force-quirks on, emit the token.
1737                         $this->emitToken(array(
1738                             'type' => self::PARSEERROR,
1739                             'data' => 'unexpected-end-of-doctype'
1740                         ));
1741                         $this->token['force-quirks'] = true;
1742                         $this->emitToken($this->token);
1743                         $state = 'data';
1744
1745                     } elseif ($char === false) {
1746                         // Same as for ">", with a different error code, and unget()
1747                         // is called so the data state reconsumes the EOF.
1748                         $this->emitToken(array(
1749                             'type' => self::PARSEERROR,
1750                             'data' => 'eof-in-doctype'
1751                         ));
1752                         $this->token['force-quirks'] = true;
1753                         $this->emitToken($this->token);
1754                         $this->stream->unget();
1755                         $state = 'data';
1756
1757                     } elseif ($char !== "\t" && $char !== "\n" && $char !== "\x0c" && $char !== ' ') {
1758                         // Anything that is neither whitespace, a quote, ">" nor EOF
1759                         // is not allowed here.
1760                         $this->emitToken(array(
1761                             'type' => self::PARSEERROR,
1762                             'data' => 'unexpected-char-in-doctype'
1763                         ));
1764                         $this->token['force-quirks'] = true;
1765                         $state = 'bogus DOCTYPE';
1766                     }
1767                     // Whitespace matches no branch and is simply skipped.
1768                 break;
1769
1770                 case 'DOCTYPE public identifier (double-quoted)':
1771                     // DOCTYPE public identifier (double-quoted) state.
1772                     //
1773                     // Transitions on the next character:
1774                     //   '"'   -> identifier complete, 'after DOCTYPE public identifier'
1775                     //   ">"   -> parse error, force-quirks on, emit the DOCTYPE
1776                     //            token, 'data'
1777                     //   EOF   -> as for ">", plus unget()
1778                     //   other -> append the character to the public identifier
1779                     $char = $this->stream->char();
1780
1781                     if ($char === '>' || $char === false) {
1782                         // Unterminated identifier: parse error, force-quirks on,
1783                         // emit the DOCTYPE token, back to the data state. ">" and
1784                         // EOF differ only in the error code and in unget() being
1785                         // called for EOF so that the data state reconsumes it.
1786                         $this->emitToken(array(
1787                             'type' => self::PARSEERROR,
1788                             'data' => ($char === false)
1789                                 ? 'eof-in-doctype'
1790                                 : 'unexpected-end-of-doctype'
1791                         ));
1792                         $this->token['force-quirks'] = true;
1793                         $this->emitToken($this->token);
1794                         if ($char === false) {
1795                             $this->stream->unget();
1796                         }
1797                         $state = 'data';
1798
1799                     } elseif ($char === '"') {
1800                         // Closing quote: the identifier is complete and is not
1801                         // extended by the quote itself.
1802                         $state = 'after DOCTYPE public identifier';
1803
1804                     } else {
1805                         // Any other character belongs to the identifier; the
1806                         // state does not change.
1807                         $this->token['public'] .= $char;
1808                     }
1809                 break;
1810
1811                 case 'DOCTYPE public identifier (single-quoted)':
1812                     // DOCTYPE public identifier (single-quoted) state.
1813                     //
1814                     // Transitions on the next character:
1815                     //   "'"   -> identifier complete, 'after DOCTYPE public identifier'
1816                     //   ">"   -> parse error, force-quirks on, emit the DOCTYPE
1817                     //            token, 'data'
1818                     //   EOF   -> as for ">", plus unget()
1819                     //   other -> append the character to the public identifier
1820                     $char = $this->stream->char();
1821
1822                     if ($char === '>' || $char === false) {
1823                         // Unterminated identifier: parse error, force-quirks on,
1824                         // emit the DOCTYPE token, back to the data state. ">" and
1825                         // EOF differ only in the error code and in unget() being
1826                         // called for EOF so that the data state reconsumes it.
1827                         $this->emitToken(array(
1828                             'type' => self::PARSEERROR,
1829                             'data' => ($char === false)
1830                                 ? 'eof-in-doctype'
1831                                 : 'unexpected-end-of-doctype'
1832                         ));
1833                         $this->token['force-quirks'] = true;
1834                         $this->emitToken($this->token);
1835                         if ($char === false) {
1836                             $this->stream->unget();
1837                         }
1838                         $state = 'data';
1839
1840                     } elseif ($char === "'") {
1841                         // Closing apostrophe: the identifier is complete and is
1842                         // not extended by the apostrophe itself.
1843                         $state = 'after DOCTYPE public identifier';
1844
1845                     } else {
1846                         // Any other character belongs to the identifier; the
1847                         // single-quoted state is kept.
1848                         $this->token['public'] .= $char;
1849                     }
1850                 break;
1851
1852                 case 'after DOCTYPE public identifier':
1853                     /* Consume the next input character: */
1854                     $char = $this->stream->char();
1855
1856                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1857                         /* U+0009 CHARACTER TABULATION
1858                            U+000A LINE FEED (LF)
1859                            U+000C FORM FEED (FF)
1860                            U+0020 SPACE
1861                         Stay in the after DOCTYPE public identifier state. */
1862                     } elseif ($char === '"') {
1863                         /* U+0022 QUOTATION MARK (")
1864                         Set the DOCTYPE token's system identifier to the
1865                         empty string (not missing), then switch to the DOCTYPE
1866                         system identifier (double-quoted) state. */
1867                         $this->token['system'] = '';
1868                         $state = 'DOCTYPE system identifier (double-quoted)';
1869                     } elseif ($char === "'") {
1870                         /* U+0027 APOSTROPHE (')
1871                         Set the DOCTYPE token's system identifier to the
1872                         empty string (not missing), then switch to the DOCTYPE
1873                         system identifier (single-quoted) state. */
1874                         $this->token['system'] = '';
1875                         $state = 'DOCTYPE system identifier (single-quoted)';
1876                     } elseif ($char === '>') {
1877                         /* U+003E GREATER-THAN SIGN (>)
1878                         Emit the current DOCTYPE token. Switch to the data state. */
1879                         $this->emitToken($this->token);
1880                         $state = 'data';
1881                     } elseif ($char === false) {
1882                         /* Parse error. Set the DOCTYPE token's force-quirks
1883                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1884                         character in the data state. */
1885                         $this->emitToken(array(
1886                             'type' => self::PARSEERROR,
1887                             'data' => 'eof-in-doctype'
1888                         ));
1889                         $this->token['force-quirks'] = true;
1890                         $this->emitToken($this->token);
1891                         $this->stream->unget();
1892                         $state = 'data';
1893                     } else {
1894                         /* Anything else
1895                         Parse error. Set the DOCTYPE token's force-quirks flag
1896                         to on. Switch to the bogus DOCTYPE state. */
1897                         $this->emitToken(array(
1898                             'type' => self::PARSEERROR,
1899                             'data' => 'unexpected-char-in-doctype'
1900                         ));
1901                         $this->token['force-quirks'] = true;
1902                         $state = 'bogus DOCTYPE';
1903                     }
1904                 break;
1905
1906                 case 'before DOCTYPE system identifier':
1907                     /* Consume the next input character: */
1908                     $char = $this->stream->char();
1909
1910                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1911                         /* U+0009 CHARACTER TABULATION
1912                            U+000A LINE FEED (LF)
1913                            U+000C FORM FEED (FF)
1914                            U+0020 SPACE
1915                         Stay in the before DOCTYPE system identifier state. */
1916                     } elseif ($char === '"') {
1917                         /* U+0022 QUOTATION MARK (")
1918                         Set the DOCTYPE token's system identifier to the empty
1919                         string (not missing), then switch to the DOCTYPE system
1920                         identifier (double-quoted) state. */
1921                         $this->token['system'] = '';
1922                         $state = 'DOCTYPE system identifier (double-quoted)';
1923                     } elseif ($char === "'") {
1924                         /* U+0027 APOSTROPHE (')
1925                         Set the DOCTYPE token's system identifier to the empty
1926                         string (not missing), then switch to the DOCTYPE system
1927                         identifier (single-quoted) state. */
1928                         $this->token['system'] = '';
1929                         $state = 'DOCTYPE system identifier (single-quoted)';
1930                     } elseif ($char === '>') {
1931                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1932                         to on. Emit that DOCTYPE token. Switch to the data state. */
1933                         $this->emitToken(array(
1934                             'type' => self::PARSEERROR,
1935                             'data' => 'unexpected-char-in-doctype'
1936                         ));
1937                         $this->token['force-quirks'] = true;
1938                         $this->emitToken($this->token);
1939                         $state = 'data';
1940                     } elseif ($char === false) {
1941                         /* Parse error. Set the DOCTYPE token's force-quirks
1942                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
1943                         character in the data state. */
1944                         $this->emitToken(array(
1945                             'type' => self::PARSEERROR,
1946                             'data' => 'eof-in-doctype'
1947                         ));
1948                         $this->token['force-quirks'] = true;
1949                         $this->emitToken($this->token);
1950                         $this->stream->unget();
1951                         $state = 'data';
1952                     } else {
1953                         /* Parse error. Set the DOCTYPE token's force-quirks flag
1954                         to on. Switch to the bogus DOCTYPE state. */
1955                         $this->emitToken(array(
1956                             'type' => self::PARSEERROR,
1957                             'data' => 'unexpected-char-in-doctype'
1958                         ));
1959                         $this->token['force-quirks'] = true;
1960                         $state = 'bogus DOCTYPE';
1961                     }
1962                 break;
1963
1964                 case 'DOCTYPE system identifier (double-quoted)':
1965                     /* Consume the next input character: */
1966                     $char = $this->stream->char();
1967
1968                     if ($char === '"') {
1969                         /* U+0022 QUOTATION MARK (")
1970                         Switch to the after DOCTYPE system identifier state. */
1971                         $state = 'after DOCTYPE system identifier';
1972                     } elseif ($char === '>') {
1973                         /* U+003E GREATER-THAN SIGN (>)
1974                         Parse error. Set the DOCTYPE token's force-quirks flag
1975                         to on. Emit that DOCTYPE token. Switch to the data state. */
1976                         $this->emitToken(array(
1977                             'type' => self::PARSEERROR,
1978                             'data' => 'unexpected-end-of-doctype'
1979                         ));
1980                         $this->token['force-quirks'] = true;
1981                         $this->emitToken($this->token);
1982                         $state = 'data';
1983                     } elseif ($char === false) {
1984                         /* EOF
1985                         Parse error. Set the DOCTYPE token's force-quirks flag
1986                         to on. Emit that DOCTYPE token. Reconsume the EOF
1987                         character in the data state. */
1988                         $this->emitToken(array(
1989                             'type' => self::PARSEERROR,
1990                             'data' => 'eof-in-doctype'
1991                         ));
1992                         $this->token['force-quirks'] = true;
1993                         $this->emitToken($this->token);
1994                         $this->stream->unget();
1995                         $state = 'data';
1996                     } else {
1997                         /* Anything else
1998                         Append the current input character to the current
1999                         DOCTYPE token's system identifier. Stay in the DOCTYPE
2000                         system identifier (double-quoted) state. */
2001                         $this->token['system'] .= $char;
2002                     }
2003                 break;
2004
2005                 case 'DOCTYPE system identifier (single-quoted)':
2006                     /* Consume the next input character: */
2007                     $char = $this->stream->char();
2008
2009                     if ($char === "'") {
2010                         /* U+0027 APOSTROPHE (')
2011                         Switch to the after DOCTYPE system identifier state. */
2012                         $state = 'after DOCTYPE system identifier';
2013                     } elseif ($char === '>') {
2014                         /* U+003E GREATER-THAN SIGN (>)
2015                         Parse error. Set the DOCTYPE token's force-quirks flag
2016                         to on. Emit that DOCTYPE token. Switch to the data state. */
2017                         $this->emitToken(array(
2018                             'type' => self::PARSEERROR,
2019                             'data' => 'unexpected-end-of-doctype'
2020                         ));
2021                         $this->token['force-quirks'] = true;
2022                         $this->emitToken($this->token);
2023                         $state = 'data';
2024                     } elseif ($char === false) {
2025                         /* EOF
2026                         Parse error. Set the DOCTYPE token's force-quirks flag
2027                         to on. Emit that DOCTYPE token. Reconsume the EOF
2028                         character in the data state. */
2029                         $this->emitToken(array(
2030                             'type' => self::PARSEERROR,
2031                             'data' => 'eof-in-doctype'
2032                         ));
2033                         $this->token['force-quirks'] = true;
2034                         $this->emitToken($this->token);
2035                         $this->stream->unget();
2036                         $state = 'data';
2037                     } else {
2038                         /* Anything else
2039                         Append the current input character to the current
2040                         DOCTYPE token's system identifier. Stay in the DOCTYPE
                        system identifier (single-quoted) state. */
2042                         $this->token['system'] .= $char;
2043                     }
2044                 break;
2045
2046                 case 'after DOCTYPE system identifier':
2047                     /* Consume the next input character: */
2048                     $char = $this->stream->char();
2049
2050                     if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
2051                         /* U+0009 CHARACTER TABULATION
2052                            U+000A LINE FEED (LF)
2053                            U+000C FORM FEED (FF)
2054                            U+0020 SPACE
2055                         Stay in the after DOCTYPE system identifier state. */
2056                     } elseif ($char === '>') {
2057                         /* U+003E GREATER-THAN SIGN (>)
2058                         Emit the current DOCTYPE token. Switch to the data state. */
2059                         $this->emitToken($this->token);
2060                         $state = 'data';
2061                     } elseif ($char === false) {
2062                         /* Parse error. Set the DOCTYPE token's force-quirks
2063                         flag to on. Emit that DOCTYPE token. Reconsume the EOF
2064                         character in the data state. */
2065                         $this->emitToken(array(
2066                             'type' => self::PARSEERROR,
2067                             'data' => 'eof-in-doctype'
2068                         ));
2069                         $this->token['force-quirks'] = true;
2070                         $this->emitToken($this->token);
2071                         $this->stream->unget();
2072                         $state = 'data';
2073                     } else {
2074                         /* Anything else
2075                         Parse error. Switch to the bogus DOCTYPE state.
2076                         (This does not set the DOCTYPE token's force-quirks
2077                         flag to on.) */
2078                         $this->emitToken(array(
2079                             'type' => self::PARSEERROR,
2080                             'data' => 'unexpected-char-in-doctype'
2081                         ));
2082                         $state = 'bogus DOCTYPE';
2083                     }
2084                 break;
2085
2086                 case 'bogus DOCTYPE':
2087                     /* Consume the next input character: */
2088                     $char = $this->stream->char();
2089
2090                     if ($char === '>') {
2091                         /* U+003E GREATER-THAN SIGN (>)
2092                         Emit the DOCTYPE token. Switch to the data state. */
2093                         $this->emitToken($this->token);
2094                         $state = 'data';
2095
2096                     } elseif ($char === false) {
2097                         /* EOF
2098                         Emit the DOCTYPE token. Reconsume the EOF character in
2099                         the data state. */
2100                         $this->emitToken($this->token);
2101                         $this->stream->unget();
2102                         $state = 'data';
2103
2104                     } else {
2105                         /* Anything else
2106                         Stay in the bogus DOCTYPE state. */
2107                     }
2108                 break;
2109
2110                 // case 'cdataSection':
2111             }
2112         }
2113     }
2114
2115     /**
2116      * Returns a serialized representation of the tree.
2117      *
2118      * @return DOMDocument|DOMNodeList
2119      */
2120     public function save() {
2121         return $this->tree->save();
2122     }
2123
2124     /**
2125      * @return HTML5_TreeBuilder The tree
2126      */
2127     public function getTree()
2128     {
2129         return $this->tree;
2130     }
2131
2132
2133     /**
2134      * Returns the input stream.
2135      *
2136      * @return HTML5_InputStream
2137      */
2138     public function stream() {
2139         return $this->stream;
2140     }
2141
2142     /**
2143      * @param bool $allowed
2144      * @param bool $inattr
2145      * @return string
2146      */
2147     private function consumeCharacterReference($allowed = false, $inattr = false) {
2148         // This goes quite far against spec, and is far closer to the Python
2149         // impl., mainly because we don't do the large unconsuming the spec
2150         // requires.
2151
2152         // All consumed characters.
2153         $chars = $this->stream->char();
2154
2155         /* This section defines how to consume a character
2156         reference. This definition is used when parsing character
2157         references in text and in attributes.
2158
2159         The behavior depends on the identity of the next character
2160         (the one immediately after the U+0026 AMPERSAND character): */
2161
2162         if (
2163             $chars[0] === "\x09" ||
2164             $chars[0] === "\x0A" ||
2165             $chars[0] === "\x0C" ||
2166             $chars[0] === "\x20" ||
2167             $chars[0] === '<' ||
2168             $chars[0] === '&' ||
2169             $chars === false ||
2170             $chars[0] === $allowed
2171         ) {
2172             /* U+0009 CHARACTER TABULATION
2173                U+000A LINE FEED (LF)
2174                U+000C FORM FEED (FF)
2175                U+0020 SPACE
2176                U+003C LESS-THAN SIGN
2177                U+0026 AMPERSAND
2178                EOF
2179                The additional allowed character, if there is one
2180             Not a character reference. No characters are consumed,
2181             and nothing is returned. (This is not an error, either.) */
2182             // We already consumed, so unconsume.
2183             $this->stream->unget();
2184             return '&';
2185         } elseif ($chars[0] === '#') {
2186             /* Consume the U+0023 NUMBER SIGN. */
2187             // Um, yeah, we already did that.
2188             /* The behavior further depends on the character after
2189             the U+0023 NUMBER SIGN: */
2190             $chars .= $this->stream->char();
2191             if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
2192                 /* U+0078 LATIN SMALL LETTER X
2193                    U+0058 LATIN CAPITAL LETTER X */
2194                 /* Consume the X. */
2195                 // Um, yeah, we already did that.
2196                 /* Follow the steps below, but using the range of
2197                 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2198                 NINE, U+0061 LATIN SMALL LETTER A through to U+0066
2199                 LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
2200                 A, through to U+0046 LATIN CAPITAL LETTER F (in other
2201                 words, 0123456789, ABCDEF, abcdef). */
2202                 $char_class = self::HEX;
2203                 /* When it comes to interpreting the
2204                 number, interpret it as a hexadecimal number. */
2205                 $hex = true;
2206             } else {
2207                 /* Anything else */
2208                 // Unconsume because we shouldn't have consumed this.
2209                 $chars = $chars[0];
2210                 $this->stream->unget();
2211                 /* Follow the steps below, but using the range of
2212                 characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2213                 NINE (i.e. just 0123456789). */
2214                 $char_class = self::DIGIT;
2215                 /* When it comes to interpreting the number,
2216                 interpret it as a decimal number. */
2217                 $hex = false;
2218             }
2219
2220             /* Consume as many characters as match the range of characters given above. */
2221             $consumed = $this->stream->charsWhile($char_class);
2222             if ($consumed === '' || $consumed === false) {
2223                 /* If no characters match the range, then don't consume
2224                 any characters (and unconsume the U+0023 NUMBER SIGN
2225                 character and, if appropriate, the X character). This
2226                 is a parse error; nothing is returned. */
2227                 $this->emitToken(array(
2228                     'type' => self::PARSEERROR,
2229                     'data' => 'expected-numeric-entity'
2230                 ));
2231                 return '&' . $chars;
2232             } else {
2233                 /* Otherwise, if the next character is a U+003B SEMICOLON,
2234                 consume that too. If it isn't, there is a parse error. */
2235                 if ($this->stream->char() !== ';') {
2236                     $this->stream->unget();
2237                     $this->emitToken(array(
2238                         'type' => self::PARSEERROR,
2239                         'data' => 'numeric-entity-without-semicolon'
2240                     ));
2241                 }
2242
2243                 /* If one or more characters match the range, then take
2244                 them all and interpret the string of characters as a number
2245                 (either hexadecimal or decimal as appropriate). */
2246                 $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
2247
2248                 /* If that number is one of the numbers in the first column
2249                 of the following table, then this is a parse error. Find the
2250                 row with that number in the first column, and return a
2251                 character token for the Unicode character given in the
2252                 second column of that row. */
2253                 $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
2254                 if ($new_codepoint) {
2255                     $this->emitToken(array(
2256                         'type' => self::PARSEERROR,
2257                         'data' => 'illegal-windows-1252-entity'
2258                     ));
2259                     return HTML5_Data::utf8chr($new_codepoint);
2260                 } else {
2261                     /* Otherwise, if the number is greater than 0x10FFFF, then
2262                      * this is a parse error. Return a U+FFFD REPLACEMENT
2263                      * CHARACTER. */
2264                     if ($codepoint > 0x10FFFF) {
2265                         $this->emitToken(array(
2266                             'type' => self::PARSEERROR,
2267                             'data' => 'overlong-character-entity' // XXX probably not correct
2268                         ));
2269                         return "\xEF\xBF\xBD";
2270                     }
2271                     /* Otherwise, return a character token for the Unicode
2272                      * character whose code point is that number.  If the
2273                      * number is in the range 0x0001 to 0x0008,    0x000E to
2274                      * 0x001F,  0x007F  to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
2275                      * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
2276                      * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
2277                      * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
2278                      * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
2279                      * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
2280                      * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
2281                      * or 0x10FFFF, then this is a parse error. */
2282                     // && has higher precedence than ||
2283                     if (
2284                         $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
2285                         $codepoint === 0x000B ||
2286                         $codepoint >= 0x000E && $codepoint <= 0x001F ||
2287                         $codepoint >= 0x007F && $codepoint <= 0x009F ||
2288                         $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
2289                         $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
2290                         ($codepoint & 0xFFFE) === 0xFFFE ||
2291                         $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
2292                     ) {
2293                         $this->emitToken(array(
2294                             'type' => self::PARSEERROR,
2295                             'data' => 'illegal-codepoint-for-numeric-entity'
2296                         ));
2297                     }
2298                     return HTML5_Data::utf8chr($codepoint);
2299                 }
2300             }
2301         } else {
2302             /* Anything else */
2303
2304             /* Consume the maximum number of characters possible,
2305             with the consumed characters matching one of the
2306             identifiers in the first column of the named character
2307             references table (in a case-sensitive manner). */
2308             // What we actually do here is consume as much as we can while it
2309             // matches the start of one of the identifiers in the first column.
2310
2311             $refs = HTML5_Data::getNamedCharacterReferences();
2312
2313             // Get the longest string which is the start of an identifier
2314             // ($chars) as well as the longest identifier which matches ($id)
2315             // and its codepoint ($codepoint).
2316             $codepoint = false;
2317             $char = $chars;
2318             while ($char !== false && isset($refs[$char])) {
2319                 $refs = $refs[$char];
2320                 if (isset($refs['codepoint'])) {
2321                     $id = $chars;
2322                     $codepoint = $refs['codepoint'];
2323                 }
2324                 $chars .= $char = $this->stream->char();
2325             }
2326
2327             // Unconsume the one character we just took which caused the while
2328             // statement to fail. This could be anything and could cause state
2329             // changes (as if it matches the while loop it must be
2330             // alphanumeric so we can just concat it to whatever we get later).
2331             $this->stream->unget();
2332             if ($char !== false) {
2333                 $chars = substr($chars, 0, -1);
2334             }
2335
2336             /* If no match can be made, then this is a parse error.
2337             No characters are consumed, and nothing is returned. */
2338             if (!$codepoint) {
2339                 $this->emitToken(array(
2340                     'type' => self::PARSEERROR,
2341                     'data' => 'expected-named-entity'
2342                 ));
2343                 return '&' . $chars;
2344             }
2345
2346             /* If the last character matched is not a U+003B SEMICOLON
2347             (;), there is a parse error. */
2348             $semicolon = true;
2349             if (substr($id, -1) !== ';') {
2350                 $this->emitToken(array(
2351                     'type' => self::PARSEERROR,
2352                     'data' => 'named-entity-without-semicolon'
2353                 ));
2354                 $semicolon = false;
2355             }
2356
2357             /* If the character reference is being consumed as part of
2358             an attribute, and the last character matched is not a
2359             U+003B SEMICOLON (;), and the next character is in the
2360             range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
2361             LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
2362             or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
2363             then, for historical reasons, all the characters that were
2364             matched after the U+0026 AMPERSAND (&) must be unconsumed,
2365             and nothing is returned. */
2366             if ($inattr && !$semicolon) {
2367                 // The next character is either the next character in $chars or in the stream.
2368                 if (strlen($chars) > strlen($id)) {
2369                     $next = substr($chars, strlen($id), 1);
2370                 } else {
2371                     $next = $this->stream->char();
2372                     $this->stream->unget();
2373                 }
2374                 if (
2375                     '0' <= $next && $next <= '9' ||
2376                     'A' <= $next && $next <= 'Z' ||
2377                     'a' <= $next && $next <= 'z'
2378                 ) {
2379                     return '&' . $chars;
2380                 }
2381             }
2382
2383             /* Otherwise, return a character token for the character
2384             corresponding to the character reference name (as given
2385             by the second column of the named character references table). */
2386             return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
2387         }
2388     }
2389
2390     /**
2391      * @param bool $allowed
2392      */
2393     private function characterReferenceInAttributeValue($allowed = false) {
2394         /* Attempt to consume a character reference. */
2395         $entity = $this->consumeCharacterReference($allowed, true);
2396
2397         /* If nothing is returned, append a U+0026 AMPERSAND
2398         character to the current attribute's value.
2399
2400         Otherwise, append the returned character token to the
2401         current attribute's value. */
2402         $char = (!$entity)
2403             ? '&'
2404             : $entity;
2405
2406         $last = count($this->token['attr']) - 1;
2407         $this->token['attr'][$last]['value'] .= $char;
2408
2409         /* Finally, switch back to the attribute value state that you
2410         were in when were switched into this state. */
2411     }
2412
2413     /**
2414      * Emits a token, passing it on to the tree builder.
2415      *
2416      * @param $token
2417      * @param bool $checkStream
2418      * @param bool $dry
2419      */
2420     protected function emitToken($token, $checkStream = true, $dry = false) {
2421         if ($checkStream === true) {
2422             // Emit errors from input stream.
2423             while ($this->stream->errors) {
2424                 $this->emitToken(array_shift($this->stream->errors), false);
2425             }
2426         }
2427         if ($token['type'] === self::ENDTAG && !empty($token['attr'])) {
2428             for ($i = 0; $i < count($token['attr']); $i++) {
2429                 $this->emitToken(array(
2430                     'type' => self::PARSEERROR,
2431                     'data' => 'attributes-in-end-tag'
2432                 ));
2433             }
2434         }
2435         if ($token['type'] === self::ENDTAG && !empty($token['self-closing'])) {
2436             $this->emitToken(array(
2437                 'type' => self::PARSEERROR,
2438                 'data' => 'self-closing-flag-on-end-tag',
2439             ));
2440         }
2441         if ($token['type'] === self::STARTTAG) {
2442             // This could be changed to actually pass the tree-builder a hash
2443             $hash = array();
2444             foreach ($token['attr'] as $keypair) {
2445                 if (isset($hash[$keypair['name']])) {
2446                     $this->emitToken(array(
2447                         'type' => self::PARSEERROR,
2448                         'data' => 'duplicate-attribute',
2449                     ));
2450                 } else {
2451                     $hash[$keypair['name']] = $keypair['value'];
2452                 }
2453             }
2454         }
2455
2456         if ($dry === false) {
2457             // the current structure of attributes is not a terribly good one
2458             $this->tree->emitToken($token);
2459         }
2460
2461         if ($dry === false && is_int($this->tree->content_model)) {
2462             $this->content_model = $this->tree->content_model;
2463             $this->tree->content_model = null;
2464
2465         } elseif ($token['type'] === self::ENDTAG) {
2466             $this->content_model = self::PCDATA;
2467         }
2468     }
2469 }
2470