composer package updates
[openemr.git] / vendor / dompdf / dompdf / lib / html5lib / Tokenizer.php
blob46e8504f2e98ff412084e52b1edaf07c07139d0c
1 <?php
3 /*
5 Copyright 2007 Jeroen van der Meer <http://jero.net/>
6 Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
7 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
9 Permission is hereby granted, free of charge, to any person obtaining a
10 copy of this software and associated documentation files (the
11 "Software"), to deal in the Software without restriction, including
12 without limitation the rights to use, copy, modify, merge, publish,
13 distribute, sublicense, and/or sell copies of the Software, and to
14 permit persons to whom the Software is furnished to do so, subject to
15 the following conditions:
17 The above copyright notice and this permission notice shall be included
18 in all copies or substantial portions of the Software.
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 // Some conventions:
31 // /* */ indicates verbatim text from the HTML 5 specification
32 // // indicates regular comments
34 // all flags are in hyphenated form
36 class HTML5_Tokenizer {
37 /**
38 * @var HTML5_InputStream
40 * Points to an InputStream object.
42 protected $stream;
44 /**
45 * @var HTML5_TreeBuilder
47 * Tree builder that the tokenizer emits tokens to.
49 private $tree;
51 /**
52 * @var int
54 * Current content model we are parsing as.
56 protected $content_model;
58 /**
59 * Current token that is being built, but not yet emitted. Also
60 * is the last token emitted, if applicable.
62 protected $token;
64 // These are constants describing the content model
// (the values stored in and compared against $this->content_model;
// the constructor starts the tokenizer in PCDATA).
65 const PCDATA = 0;
66 const RCDATA = 1;
67 const CDATA = 2;
68 const PLAINTEXT = 3;
70 // These are constants describing tokens
// (they appear as the 'type' entry of the token arrays handed to emitToken()).
71 // XXX should probably be moved somewhere else, probably the
72 // HTML5 class.
73 const DOCTYPE = 0;
74 const STARTTAG = 1;
75 const ENDTAG = 2;
76 const COMMENT = 3;
77 const CHARACTER = 4;
78 const SPACECHARACTER = 5;
79 const EOF = 6;
80 const PARSEERROR = 7;
82 // These are constants representing bunches of characters.
// They are passed as character masks to the input stream's
// charsWhile()/charsUntil() calls.
83 const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
84 const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
85 const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
86 const DIGIT = '0123456789';
87 const HEX = '0123456789ABCDEFabcdef';
// Tab, line feed, form feed and space.
88 const WHITESPACE = "\t\n\x0c ";
/**
 * Sets up a tokenizer over the given input.
 *
 * The input is wrapped in an HTML5_InputStream, the tree builder is
 * stored (or created when none is supplied), and the content model
 * starts out as PCDATA.
 *
 * @param mixed $data Data to parse; handed unchanged to HTML5_InputStream.
 * @param HTML5_TreeBuilder|null $builder Tree builder that receives the
 *        tokens; a new HTML5_TreeBuilder is created when this is falsy.
 */
public function __construct($data, $builder = null) {
    $this->stream = new HTML5_InputStream($data);
    // Fall back to a default tree builder when the caller gave none.
    $this->tree = $builder ?: new HTML5_TreeBuilder();
    $this->content_model = self::PCDATA;
}
/**
 * Parses the input as a fragment in the given context.
 *
 * The context is passed to the tree builder's setupContext(). If the
 * builder then exposes a truthy content model, the tokenizer adopts it
 * and resets the builder's copy to null before parsing starts.
 *
 * @param mixed $context Forwarded as-is to the tree builder's setupContext().
 */
public function parseFragment($context = null) {
    $builder = $this->tree;
    $builder->setupContext($context);

    // Take over the content model chosen by the builder, if any.
    $model = $builder->content_model;
    if ($model) {
        $this->content_model = $model;
        $builder->content_model = null;
    }

    $this->parse();
}
116 // XXX maybe convert this into an iterator? regardless, this function
117 // and the save function should go into a Parser facade of some sort
119 * Performs the actual parsing of the document.
121 public function parse() {
122 // Current state
123 $state = 'data';
124 // This is used to avoid having to have look-behind in the data state.
125 $lastFourChars = '';
127 * Escape flag as specified by the HTML5 specification: "used to
128 * control the behavior of the tokeniser. It is either true or
129 * false, and initially must be set to the false state."
131 $escape = false;
132 //echo "\n\n";
133 while($state !== null) {
135 /*echo $state . ' ';
136 switch ($this->content_model) {
137 case self::PCDATA: echo 'PCDATA'; break;
138 case self::RCDATA: echo 'RCDATA'; break;
139 case self::CDATA: echo 'CDATA'; break;
140 case self::PLAINTEXT: echo 'PLAINTEXT'; break;
142 if ($escape) echo " escape";
143 echo "\n";*/
145 switch($state) {
146 case 'data':
148 /* Consume the next input character */
149 $char = $this->stream->char();
150 $lastFourChars .= $char;
151 if (strlen($lastFourChars) > 4) {
152 $lastFourChars = substr($lastFourChars, -4);
155 // see below for meaning
156 $hyp_cond =
157 !$escape &&
159 $this->content_model === self::RCDATA ||
160 $this->content_model === self::CDATA
162 $amp_cond =
163 !$escape &&
165 $this->content_model === self::PCDATA ||
166 $this->content_model === self::RCDATA
168 $lt_cond =
169 $this->content_model === self::PCDATA ||
172 $this->content_model === self::RCDATA ||
173 $this->content_model === self::CDATA
174 ) &&
175 !$escape
177 $gt_cond =
178 $escape &&
180 $this->content_model === self::RCDATA ||
181 $this->content_model === self::CDATA
184 if ($char === '&' && $amp_cond === true) {
185 /* U+0026 AMPERSAND (&)
186 When the content model flag is set to one of the PCDATA or RCDATA
187 states and the escape flag is false: switch to the
188 character reference data state. Otherwise: treat it as per
189 the "anything else" entry below. */
190 $state = 'character reference data';
192 } elseif (
193 $char === '-' &&
194 $hyp_cond === true &&
195 $lastFourChars === '<!--'
198 U+002D HYPHEN-MINUS (-)
199 If the content model flag is set to either the RCDATA state or
200 the CDATA state, and the escape flag is false, and there are at
201 least three characters before this one in the input stream, and the
202 last four characters in the input stream, including this one, are
203 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
204 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
205 $escape = true;
207 /* In any case, emit the input character as a character token. Stay
208 in the data state. */
209 $this->emitToken(array(
210 'type' => self::CHARACTER,
211 'data' => '-'
213 // We do the "any case" part as part of "anything else".
215 /* U+003C LESS-THAN SIGN (<) */
216 } elseif ($char === '<' && $lt_cond === true) {
217 /* When the content model flag is set to the PCDATA state: switch
218 to the tag open state.
220 When the content model flag is set to either the RCDATA state or
221 the CDATA state and the escape flag is false: switch to the tag
222 open state.
224 Otherwise: treat it as per the "anything else" entry below. */
225 $state = 'tag open';
227 /* U+003E GREATER-THAN SIGN (>) */
228 } elseif (
229 $char === '>' &&
230 $gt_cond === true &&
231 substr($lastFourChars, 1) === '-->'
233 /* If the content model flag is set to either the RCDATA state or
234 the CDATA state, and the escape flag is true, and the last three
235 characters in the input stream including this one are U+002D
236 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
237 set the escape flag to false. */
238 $escape = false;
240 /* In any case, emit the input character as a character token.
241 Stay in the data state. */
242 $this->emitToken(array(
243 'type' => self::CHARACTER,
244 'data' => '>'
246 // We do the "any case" part as part of "anything else".
248 } elseif ($char === false) {
249 /* EOF
250 Emit an end-of-file token. */
251 $state = null;
252 $this->tree->emitToken(array(
253 'type' => self::EOF
256 } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
257 // Directly after emitting a token you switch back to the "data
258 // state". At that point spaceCharacters are important so they are
259 // emitted separately.
260 $chars = $this->stream->charsWhile(self::WHITESPACE);
261 $this->emitToken(array(
262 'type' => self::SPACECHARACTER,
263 'data' => $char . $chars
265 $lastFourChars .= $chars;
266 if (strlen($lastFourChars) > 4) {
267 $lastFourChars = substr($lastFourChars, -4);
269 } else {
270 /* Anything else
271 THIS IS AN OPTIMIZATION: Get as many characters as would
272 otherwise also be treated as character tokens and emit them
273 as a single character token. Stay in the data state. */
275 $mask = '';
276 if ($hyp_cond === true) {
277 $mask .= '-';
279 if ($amp_cond === true) {
280 $mask .= '&';
282 if ($lt_cond === true) {
283 $mask .= '<';
285 if ($gt_cond === true) {
286 $mask .= '>';
289 if ($mask === '') {
290 $chars = $this->stream->remainingChars();
291 } else {
292 $chars = $this->stream->charsUntil($mask);
295 $this->emitToken(array(
296 'type' => self::CHARACTER,
297 'data' => $char . $chars
300 $lastFourChars .= $chars;
301 if (strlen($lastFourChars) > 4) {
302 $lastFourChars = substr($lastFourChars, -4);
305 $state = 'data';
307 break;
309 case 'character reference data':
310 /* (This cannot happen if the content model flag
311 is set to the CDATA state.) */
313 /* Attempt to consume a character reference, with no
314 additional allowed character. */
315 $entity = $this->consumeCharacterReference();
317 /* If nothing is returned, emit a U+0026 AMPERSAND
318 character token. Otherwise, emit the character token that
319 was returned. */
320 // This is all done when consuming the character reference.
321 $this->emitToken(array(
322 'type' => self::CHARACTER,
323 'data' => $entity
326 /* Finally, switch to the data state. */
327 $state = 'data';
328 break;
330 case 'tag open':
331 $char = $this->stream->char();
333 switch ($this->content_model) {
334 case self::RCDATA:
335 case self::CDATA:
336 /* Consume the next input character. If it is a
337 U+002F SOLIDUS (/) character, switch to the close
338 tag open state. Otherwise, emit a U+003C LESS-THAN
339 SIGN character token and reconsume the current input
340 character in the data state. */
341 // We consumed above.
343 if ($char === '/') {
344 $state = 'close tag open';
345 } else {
346 $this->emitToken(array(
347 'type' => self::CHARACTER,
348 'data' => '<'
351 $this->stream->unget();
353 $state = 'data';
355 break;
357 case self::PCDATA:
358 /* If the content model flag is set to the PCDATA state
359 Consume the next input character: */
360 // We consumed above.
362 if ($char === '!') {
363 /* U+0021 EXCLAMATION MARK (!)
364 Switch to the markup declaration open state. */
365 $state = 'markup declaration open';
367 } elseif ($char === '/') {
368 /* U+002F SOLIDUS (/)
369 Switch to the close tag open state. */
370 $state = 'close tag open';
372 } elseif ('A' <= $char && $char <= 'Z') {
373 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
374 Create a new start tag token, set its tag name to the lowercase
375 version of the input character (add 0x0020 to the character's code
376 point), then switch to the tag name state. (Don't emit the token
377 yet; further details will be filled in before it is emitted.) */
378 $this->token = array(
379 'name' => strtolower($char),
380 'type' => self::STARTTAG,
381 'attr' => array()
384 $state = 'tag name';
386 } elseif ('a' <= $char && $char <= 'z') {
387 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
388 Create a new start tag token, set its tag name to the input
389 character, then switch to the tag name state. (Don't emit
390 the token yet; further details will be filled in before it
391 is emitted.) */
392 $this->token = array(
393 'name' => $char,
394 'type' => self::STARTTAG,
395 'attr' => array()
398 $state = 'tag name';
400 } elseif ($char === '>') {
401 /* U+003E GREATER-THAN SIGN (>)
402 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
403 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
404 $this->emitToken(array(
405 'type' => self::PARSEERROR,
406 'data' => 'expected-tag-name-but-got-right-bracket'
408 $this->emitToken(array(
409 'type' => self::CHARACTER,
410 'data' => '<>'
413 $state = 'data';
415 } elseif ($char === '?') {
416 /* U+003F QUESTION MARK (?)
417 Parse error. Switch to the bogus comment state. */
418 $this->emitToken(array(
419 'type' => self::PARSEERROR,
420 'data' => 'expected-tag-name-but-got-question-mark'
422 $this->token = array(
423 'data' => '?',
424 'type' => self::COMMENT
426 $state = 'bogus comment';
428 } else {
429 /* Anything else
430 Parse error. Emit a U+003C LESS-THAN SIGN character token and
431 reconsume the current input character in the data state. */
432 $this->emitToken(array(
433 'type' => self::PARSEERROR,
434 'data' => 'expected-tag-name'
436 $this->emitToken(array(
437 'type' => self::CHARACTER,
438 'data' => '<'
441 $state = 'data';
442 $this->stream->unget();
444 break;
446 break;
448 case 'close tag open':
449 if (
450 $this->content_model === self::RCDATA ||
451 $this->content_model === self::CDATA
453 /* If the content model flag is set to the RCDATA or CDATA
454 states... */
455 $name = strtolower($this->stream->charsWhile(self::ALPHA));
456 $following = $this->stream->char();
457 $this->stream->unget();
458 if (
459 !$this->token ||
460 $this->token['name'] !== $name ||
461 $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
463 /* if no start tag token has ever been emitted by this instance
464 of the tokenizer (fragment case), or, if the next few
465 characters do not match the tag name of the last start tag
466 token emitted (compared in an ASCII case-insensitive manner),
467 or if they do but they are not immediately followed by one of
468 the following characters:
470 * U+0009 CHARACTER TABULATION
471 * U+000A LINE FEED (LF)
472 * U+000C FORM FEED (FF)
473 * U+0020 SPACE
474 * U+003E GREATER-THAN SIGN (>)
475 * U+002F SOLIDUS (/)
476 * EOF
478 ...then emit a U+003C LESS-THAN SIGN character token, a
479 U+002F SOLIDUS character token, and switch to the data
480 state to process the next input character. */
481 // XXX: Probably ought to replace in_array with $following === x ||...
483 // We also need to emit $name now we've consumed that, as we
484 // know it'll just be emitted as a character token.
485 $this->emitToken(array(
486 'type' => self::CHARACTER,
487 'data' => '</' . $name
490 $state = 'data';
491 } else {
492 // This matches what would happen if we actually did the
493 // otherwise below (but we can't because we've consumed too
494 // much).
496 // Start the end tag token with the name we already have.
497 $this->token = array(
498 'name' => $name,
499 'type' => self::ENDTAG
502 // Change to tag name state.
503 $state = 'tag name';
505 } elseif ($this->content_model === self::PCDATA) {
506 /* Otherwise, if the content model flag is set to the PCDATA
507 state [...]: */
508 $char = $this->stream->char();
510 if ('A' <= $char && $char <= 'Z') {
511 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
512 Create a new end tag token, set its tag name to the lowercase version
513 of the input character (add 0x0020 to the character's code point), then
514 switch to the tag name state. (Don't emit the token yet; further details
515 will be filled in before it is emitted.) */
516 $this->token = array(
517 'name' => strtolower($char),
518 'type' => self::ENDTAG
521 $state = 'tag name';
523 } elseif ('a' <= $char && $char <= 'z') {
524 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
525 Create a new end tag token, set its tag name to the
526 input character, then switch to the tag name state.
527 (Don't emit the token yet; further details will be
528 filled in before it is emitted.) */
529 $this->token = array(
530 'name' => $char,
531 'type' => self::ENDTAG
534 $state = 'tag name';
536 } elseif ($char === '>') {
537 /* U+003E GREATER-THAN SIGN (>)
538 Parse error. Switch to the data state. */
539 $this->emitToken(array(
540 'type' => self::PARSEERROR,
541 'data' => 'expected-closing-tag-but-got-right-bracket'
543 $state = 'data';
545 } elseif ($char === false) {
546 /* EOF
547 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
548 SOLIDUS character token. Reconsume the EOF character in the data state. */
549 $this->emitToken(array(
550 'type' => self::PARSEERROR,
551 'data' => 'expected-closing-tag-but-got-eof'
553 $this->emitToken(array(
554 'type' => self::CHARACTER,
555 'data' => '</'
558 $this->stream->unget();
559 $state = 'data';
561 } else {
562 /* Parse error. Switch to the bogus comment state. */
563 $this->emitToken(array(
564 'type' => self::PARSEERROR,
565 'data' => 'expected-closing-tag-but-got-char'
567 $this->token = array(
568 'data' => $char,
569 'type' => self::COMMENT
571 $state = 'bogus comment';
574 break;
576 case 'tag name':
577 /* Consume the next input character: */
578 $char = $this->stream->char();
580 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
581 /* U+0009 CHARACTER TABULATION
582 U+000A LINE FEED (LF)
583 U+000C FORM FEED (FF)
584 U+0020 SPACE
585 Switch to the before attribute name state. */
586 $state = 'before attribute name';
588 } elseif ($char === '/') {
589 /* U+002F SOLIDUS (/)
590 Switch to the self-closing start tag state. */
591 $state = 'self-closing start tag';
593 } elseif ($char === '>') {
594 /* U+003E GREATER-THAN SIGN (>)
595 Emit the current tag token. Switch to the data state. */
596 $this->emitToken($this->token);
597 $state = 'data';
599 } elseif ('A' <= $char && $char <= 'Z') {
600 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
601 Append the lowercase version of the current input
602 character (add 0x0020 to the character's code point) to
603 the current tag token's tag name. Stay in the tag name state. */
604 $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
606 $this->token['name'] .= strtolower($char . $chars);
607 $state = 'tag name';
609 } elseif ($char === false) {
610 /* EOF
611 Parse error. Reconsume the EOF character in the data state. */
612 $this->emitToken(array(
613 'type' => self::PARSEERROR,
614 'data' => 'eof-in-tag-name'
617 $this->stream->unget();
618 $state = 'data';
620 } else {
621 /* Anything else
622 Append the current input character to the current tag token's tag name.
623 Stay in the tag name state. */
624 $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
626 $this->token['name'] .= $char . $chars;
627 $state = 'tag name';
629 break;
631 case 'before attribute name':
632 /* Consume the next input character: */
633 $char = $this->stream->char();
635 // this conditional is optimized, check bottom
636 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
637 /* U+0009 CHARACTER TABULATION
638 U+000A LINE FEED (LF)
639 U+000C FORM FEED (FF)
640 U+0020 SPACE
641 Stay in the before attribute name state. */
642 $state = 'before attribute name';
644 } elseif ($char === '/') {
645 /* U+002F SOLIDUS (/)
646 Switch to the self-closing start tag state. */
647 $state = 'self-closing start tag';
649 } elseif ($char === '>') {
650 /* U+003E GREATER-THAN SIGN (>)
651 Emit the current tag token. Switch to the data state. */
652 $this->emitToken($this->token);
653 $state = 'data';
655 } elseif ('A' <= $char && $char <= 'Z') {
656 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
657 Start a new attribute in the current tag token. Set that
658 attribute's name to the lowercase version of the current
659 input character (add 0x0020 to the character's code
660 point), and its value to the empty string. Switch to the
661 attribute name state.*/
662 $this->token['attr'][] = array(
663 'name' => strtolower($char),
664 'value' => ''
667 $state = 'attribute name';
669 } elseif ($char === false) {
670 /* EOF
671 Parse error. Reconsume the EOF character in the data state. */
672 $this->emitToken(array(
673 'type' => self::PARSEERROR,
674 'data' => 'expected-attribute-name-but-got-eof'
677 $this->stream->unget();
678 $state = 'data';
680 } else {
681 /* U+0022 QUOTATION MARK (")
682 U+0027 APOSTROPHE (')
683 U+003C LESS-THAN SIGN (<)
684 U+003D EQUALS SIGN (=)
685 Parse error. Treat it as per the "anything else" entry
686 below. */
687 if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
688 $this->emitToken(array(
689 'type' => self::PARSEERROR,
690 'data' => 'invalid-character-in-attribute-name'
694 /* Anything else
695 Start a new attribute in the current tag token. Set that attribute's
696 name to the current input character, and its value to the empty string.
697 Switch to the attribute name state. */
698 $this->token['attr'][] = array(
699 'name' => $char,
700 'value' => ''
703 $state = 'attribute name';
705 break;
707 case 'attribute name':
708 // Consume the next input character:
709 $char = $this->stream->char();
711 // this conditional is optimized, check bottom
712 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
713 /* U+0009 CHARACTER TABULATION
714 U+000A LINE FEED (LF)
715 U+000C FORM FEED (FF)
716 U+0020 SPACE
717 Switch to the after attribute name state. */
718 $state = 'after attribute name';
720 } elseif ($char === '/') {
721 /* U+002F SOLIDUS (/)
722 Switch to the self-closing start tag state. */
723 $state = 'self-closing start tag';
725 } elseif ($char === '=') {
726 /* U+003D EQUALS SIGN (=)
727 Switch to the before attribute value state. */
728 $state = 'before attribute value';
730 } elseif ($char === '>') {
731 /* U+003E GREATER-THAN SIGN (>)
732 Emit the current tag token. Switch to the data state. */
733 $this->emitToken($this->token);
734 $state = 'data';
736 } elseif ('A' <= $char && $char <= 'Z') {
737 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
738 Append the lowercase version of the current input
739 character (add 0x0020 to the character's code point) to
740 the current attribute's name. Stay in the attribute name
741 state. */
742 $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
744 $last = count($this->token['attr']) - 1;
745 $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
747 $state = 'attribute name';
749 } elseif ($char === false) {
750 /* EOF
751 Parse error. Reconsume the EOF character in the data state. */
752 $this->emitToken(array(
753 'type' => self::PARSEERROR,
754 'data' => 'eof-in-attribute-name'
757 $this->stream->unget();
758 $state = 'data';
760 } else {
761 /* U+0022 QUOTATION MARK (")
762 U+0027 APOSTROPHE (')
763 U+003C LESS-THAN SIGN (<)
764 Parse error. Treat it as per the "anything else"
765 entry below. */
766 if ($char === '"' || $char === "'" || $char === '<') {
767 $this->emitToken(array(
768 'type' => self::PARSEERROR,
769 'data' => 'invalid-character-in-attribute-name'
773 /* Anything else
774 Append the current input character to the current attribute's name.
775 Stay in the attribute name state. */
776 $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
778 $last = count($this->token['attr']) - 1;
779 $this->token['attr'][$last]['name'] .= $char . $chars;
781 $state = 'attribute name';
784 /* When the user agent leaves the attribute name state
785 (and before emitting the tag token, if appropriate), the
786 complete attribute's name must be compared to the other
787 attributes on the same token; if there is already an
788 attribute on the token with the exact same name, then this
789 is a parse error and the new attribute must be dropped, along
790 with the value that gets associated with it (if any). */
791 // this might be implemented in the emitToken method
792 break;
794 case 'after attribute name':
795 // Consume the next input character:
796 $char = $this->stream->char();
798 // this is an optimized conditional, check the bottom
799 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
800 /* U+0009 CHARACTER TABULATION
801 U+000A LINE FEED (LF)
802 U+000C FORM FEED (FF)
803 U+0020 SPACE
804 Stay in the after attribute name state. */
805 $state = 'after attribute name';
807 } elseif ($char === '/') {
808 /* U+002F SOLIDUS (/)
809 Switch to the self-closing start tag state. */
810 $state = 'self-closing start tag';
812 } elseif ($char === '=') {
813 /* U+003D EQUALS SIGN (=)
814 Switch to the before attribute value state. */
815 $state = 'before attribute value';
817 } elseif ($char === '>') {
818 /* U+003E GREATER-THAN SIGN (>)
819 Emit the current tag token. Switch to the data state. */
820 $this->emitToken($this->token);
821 $state = 'data';
823 } elseif ('A' <= $char && $char <= 'Z') {
824 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
825 Start a new attribute in the current tag token. Set that
826 attribute's name to the lowercase version of the current
827 input character (add 0x0020 to the character's code
828 point), and its value to the empty string. Switch to the
829 attribute name state. */
830 $this->token['attr'][] = array(
831 'name' => strtolower($char),
832 'value' => ''
835 $state = 'attribute name';
837 } elseif ($char === false) {
838 /* EOF
839 Parse error. Reconsume the EOF character in the data state. */
840 $this->emitToken(array(
841 'type' => self::PARSEERROR,
842 'data' => 'expected-end-of-tag-but-got-eof'
845 $this->stream->unget();
846 $state = 'data';
848 } else {
849 /* U+0022 QUOTATION MARK (")
850 U+0027 APOSTROPHE (')
851 U+003C LESS-THAN SIGN(<)
852 Parse error. Treat it as per the "anything else"
853 entry below. */
854 if ($char === '"' || $char === "'" || $char === "<") {
855 $this->emitToken(array(
856 'type' => self::PARSEERROR,
857 'data' => 'invalid-character-after-attribute-name'
861 /* Anything else
862 Start a new attribute in the current tag token. Set that attribute's
863 name to the current input character, and its value to the empty string.
864 Switch to the attribute name state. */
865 $this->token['attr'][] = array(
866 'name' => $char,
867 'value' => ''
870 $state = 'attribute name';
872 break;
874 case 'before attribute value':
875 // Consume the next input character:
876 $char = $this->stream->char();
878 // this is an optimized conditional
879 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
880 /* U+0009 CHARACTER TABULATION
881 U+000A LINE FEED (LF)
882 U+000C FORM FEED (FF)
883 U+0020 SPACE
884 Stay in the before attribute value state. */
885 $state = 'before attribute value';
887 } elseif ($char === '"') {
888 /* U+0022 QUOTATION MARK (")
889 Switch to the attribute value (double-quoted) state. */
890 $state = 'attribute value (double-quoted)';
892 } elseif ($char === '&') {
893 /* U+0026 AMPERSAND (&)
894 Switch to the attribute value (unquoted) state and reconsume
895 this input character. */
896 $this->stream->unget();
897 $state = 'attribute value (unquoted)';
899 } elseif ($char === '\'') {
900 /* U+0027 APOSTROPHE (')
901 Switch to the attribute value (single-quoted) state. */
902 $state = 'attribute value (single-quoted)';
904 } elseif ($char === '>') {
905 /* U+003E GREATER-THAN SIGN (>)
906 Parse error. Emit the current tag token. Switch to the data state. */
907 $this->emitToken(array(
908 'type' => self::PARSEERROR,
909 'data' => 'expected-attribute-value-but-got-right-bracket'
911 $this->emitToken($this->token);
912 $state = 'data';
914 } elseif ($char === false) {
915 /* EOF
916 Parse error. Reconsume the EOF character in the data state. */
917 $this->emitToken(array(
918 'type' => self::PARSEERROR,
919 'data' => 'expected-attribute-value-but-got-eof'
921 $this->stream->unget();
922 $state = 'data';
924 } else {
925 /* U+003D EQUALS SIGN (=)
926 * U+003C LESS-THAN SIGN (<)
927 Parse error. Treat it as per the "anything else" entry below. */
928 if ($char === '=' || $char === '<') {
929 $this->emitToken(array(
930 'type' => self::PARSEERROR,
931 'data' => 'equals-in-unquoted-attribute-value'
935 /* Anything else
936 Append the current input character to the current attribute's value.
937 Switch to the attribute value (unquoted) state. */
938 $last = count($this->token['attr']) - 1;
939 $this->token['attr'][$last]['value'] .= $char;
941 $state = 'attribute value (unquoted)';
943 break;
945 case 'attribute value (double-quoted)':
946 // Consume the next input character:
947 $char = $this->stream->char();
949 if ($char === '"') {
950 /* U+0022 QUOTATION MARK (")
951 Switch to the after attribute value (quoted) state. */
952 $state = 'after attribute value (quoted)';
954 } elseif ($char === '&') {
955 /* U+0026 AMPERSAND (&)
956 Switch to the character reference in attribute value
957 state, with the additional allowed character
958 being U+0022 QUOTATION MARK ("). */
959 $this->characterReferenceInAttributeValue('"');
961 } elseif ($char === false) {
962 /* EOF
963 Parse error. Reconsume the EOF character in the data state. */
964 $this->emitToken(array(
965 'type' => self::PARSEERROR,
966 'data' => 'eof-in-attribute-value-double-quote'
969 $this->stream->unget();
970 $state = 'data';
972 } else {
973 /* Anything else
974 Append the current input character to the current attribute's value.
975 Stay in the attribute value (double-quoted) state. */
976 $chars = $this->stream->charsUntil('"&');
978 $last = count($this->token['attr']) - 1;
979 $this->token['attr'][$last]['value'] .= $char . $chars;
981 $state = 'attribute value (double-quoted)';
983 break;
985 case 'attribute value (single-quoted)':
986 // Consume the next input character:
987 $char = $this->stream->char();
989 if ($char === "'") {
990 /* U+0027 APOSTROPHE (')
991 Switch to the after attribute value (quoted) state. */
992 $state = 'after attribute value (quoted)';
994 } elseif ($char === '&') {
995 /* U+0026 AMPERSAND (&)
996 Switch to the character reference in attribute value state, with the additional allowed character being U+0027 APOSTROPHE ('). */
997 $this->characterReferenceInAttributeValue("'");
999 } elseif ($char === false) {
1000 /* EOF
1001 Parse error. Reconsume the EOF character in the data state. */
1002 $this->emitToken(array(
1003 'type' => self::PARSEERROR,
1004 'data' => 'eof-in-attribute-value-single-quote'
1007 $this->stream->unget();
1008 $state = 'data';
1010 } else {
1011 /* Anything else
1012 Append the current input character to the current attribute's value.
1013 Stay in the attribute value (single-quoted) state. */
1014 $chars = $this->stream->charsUntil("'&");
1016 $last = count($this->token['attr']) - 1;
1017 $this->token['attr'][$last]['value'] .= $char . $chars;
1019 $state = 'attribute value (single-quoted)';
1021 break;
1023 case 'attribute value (unquoted)':
1024 // Consume the next input character:
1025 $char = $this->stream->char();
1027 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1028 /* U+0009 CHARACTER TABULATION
1029 U+000A LINE FEED (LF)
1030 U+000C FORM FEED (FF)
1031 U+0020 SPACE
1032 Switch to the before attribute name state. */
1033 $state = 'before attribute name';
1035 } elseif ($char === '&') {
1036 /* U+0026 AMPERSAND (&)
1037 Switch to the entity in attribute value state, with the
1038 additional allowed character being U+003E
1039 GREATER-THAN SIGN (>). */
1040 $this->characterReferenceInAttributeValue('>');
1042 } elseif ($char === '>') {
1043 /* U+003E GREATER-THAN SIGN (>)
1044 Emit the current tag token. Switch to the data state. */
1045 $this->emitToken($this->token);
1046 $state = 'data';
1048 } elseif ($char === false) {
1049 /* EOF
1050 Parse error. Reconsume the EOF character in the data state. */
1051 $this->emitToken(array(
1052 'type' => self::PARSEERROR,
1053 'data' => 'eof-in-attribute-value-no-quotes'
1055 $this->stream->unget();
1056 $state = 'data';
1058 } else {
1059 /* U+0022 QUOTATION MARK (")
1060 U+0027 APOSTROPHE (')
1061 U+003C LESS-THAN SIGN (<)
1062 U+003D EQUALS SIGN (=)
1063 Parse error. Treat it as per the "anything else"
1064 entry below. */
1065 if ($char === '"' || $char === "'" || $char === '=' || $char == '<') {
1066 $this->emitToken(array(
1067 'type' => self::PARSEERROR,
1068 'data' => 'unexpected-character-in-unquoted-attribute-value'
1072 /* Anything else
1073 Append the current input character to the current attribute's value.
1074 Stay in the attribute value (unquoted) state. */
1075 $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1077 $last = count($this->token['attr']) - 1;
1078 $this->token['attr'][$last]['value'] .= $char . $chars;
1080 $state = 'attribute value (unquoted)';
1082 break;
1084 case 'after attribute value (quoted)':
1085 /* Consume the next input character: */
1086 $char = $this->stream->char();
1088 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1089 /* U+0009 CHARACTER TABULATION
1090 U+000A LINE FEED (LF)
1091 U+000C FORM FEED (FF)
1092 U+0020 SPACE
1093 Switch to the before attribute name state. */
1094 $state = 'before attribute name';
1096 } elseif ($char === '/') {
1097 /* U+002F SOLIDUS (/)
1098 Switch to the self-closing start tag state. */
1099 $state = 'self-closing start tag';
1101 } elseif ($char === '>') {
1102 /* U+003E GREATER-THAN SIGN (>)
1103 Emit the current tag token. Switch to the data state. */
1104 $this->emitToken($this->token);
1105 $state = 'data';
1107 } elseif ($char === false) {
1108 /* EOF
1109 Parse error. Reconsume the EOF character in the data state. */
1110 $this->emitToken(array(
1111 'type' => self::PARSEERROR,
1112 'data' => 'unexpected-EOF-after-attribute-value'
1114 $this->stream->unget();
1115 $state = 'data';
1117 } else {
1118 /* Anything else
1119 Parse error. Reconsume the character in the before attribute
1120 name state. */
1121 $this->emitToken(array(
1122 'type' => self::PARSEERROR,
1123 'data' => 'unexpected-character-after-attribute-value'
1125 $this->stream->unget();
1126 $state = 'before attribute name';
1128 break;
1130 case 'self-closing start tag':
1131 /* Consume the next input character: */
1132 $char = $this->stream->char();
1134 if ($char === '>') {
1135 /* U+003E GREATER-THAN SIGN (>)
1136 Set the self-closing flag of the current tag token.
1137 Emit the current tag token. Switch to the data state. */
1138 // not sure if this is the name we want
1139 $this->token['self-closing'] = true;
1140 $this->emitToken($this->token);
1141 $state = 'data';
1143 } elseif ($char === false) {
1144 /* EOF
1145 Parse error. Reconsume the EOF character in the data state. */
1146 $this->emitToken(array(
1147 'type' => self::PARSEERROR,
1148 'data' => 'unexpected-eof-after-self-closing'
1150 $this->stream->unget();
1151 $state = 'data';
1153 } else {
1154 /* Anything else
1155 Parse error. Reconsume the character in the before attribute name state. */
1156 $this->emitToken(array(
1157 'type' => self::PARSEERROR,
1158 'data' => 'unexpected-character-after-self-closing'
1160 $this->stream->unget();
1161 $state = 'before attribute name';
1163 break;
1165 case 'bogus comment':
1166 /* (This can only happen if the content model flag is set to the PCDATA state.) */
1167 /* Consume every character up to the first U+003E GREATER-THAN SIGN
1168 character (>) or the end of the file (EOF), whichever comes first. Emit
1169 a comment token whose data is the concatenation of all the characters
1170 starting from and including the character that caused the state machine
1171 to switch into the bogus comment state, up to and including the last
1172 consumed character before the U+003E character, if any, or up to the
1173 end of the file otherwise. (If the comment was started by the end of
1174 the file (EOF), the token is empty.) */
1175 $this->token['data'] .= (string) $this->stream->charsUntil('>');
1176 $this->stream->char();
1178 $this->emitToken($this->token);
1180 /* Switch to the data state. */
1181 $state = 'data';
1182 break;
1184 case 'markup declaration open':
1185 // Consume for below
1186 $hyphens = $this->stream->charsWhile('-', 2);
1187 if ($hyphens === '-') {
1188 $this->stream->unget();
1190 if ($hyphens !== '--') {
1191 $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1194 /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1195 characters, consume those two characters, create a comment token whose
1196 data is the empty string, and switch to the comment state. */
1197 if ($hyphens === '--') {
1198 $state = 'comment start';
1199 $this->token = array(
1200 'data' => '',
1201 'type' => self::COMMENT
1204 /* Otherwise if the next seven characters are a case-insensitive match
1205 for the word "DOCTYPE", then consume those characters and switch to the
1206 DOCTYPE state. */
1207 } elseif (strtoupper($alpha) === 'DOCTYPE') {
1208 $state = 'DOCTYPE';
1210 // XXX not implemented
1211 /* Otherwise, if the insertion mode is "in foreign content"
1212 and the current node is not an element in the HTML namespace
1213 and the next seven characters are an ASCII case-sensitive
1214 match for the string "[CDATA[" (the five uppercase letters
1215 "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1216 and after), then consume those characters and switch to the
1217 CDATA section state (which is unrelated to the content model
1218 flag's CDATA state). */
1220 /* Otherwise, this is a parse error. Switch to the bogus comment state.
1221 The next character that is consumed, if any, is the first character
1222 that will be in the comment. */
1223 } else {
1224 $this->emitToken(array(
1225 'type' => self::PARSEERROR,
1226 'data' => 'expected-dashes-or-doctype'
1228 $this->token = array(
1229 'data' => (string) $alpha,
1230 'type' => self::COMMENT
1232 $state = 'bogus comment';
1234 break;
1236 case 'comment start':
1237 /* Consume the next input character: */
1238 $char = $this->stream->char();
1240 if ($char === '-') {
1241 /* U+002D HYPHEN-MINUS (-)
1242 Switch to the comment start dash state. */
1243 $state = 'comment start dash';
1244 } elseif ($char === '>') {
1245 /* U+003E GREATER-THAN SIGN (>)
1246 Parse error. Emit the comment token. Switch to the
1247 data state. */
1248 $this->emitToken(array(
1249 'type' => self::PARSEERROR,
1250 'data' => 'incorrect-comment'
1252 $this->emitToken($this->token);
1253 $state = 'data';
1254 } elseif ($char === false) {
1255 /* EOF
1256 Parse error. Emit the comment token. Reconsume the
1257 EOF character in the data state. */
1258 $this->emitToken(array(
1259 'type' => self::PARSEERROR,
1260 'data' => 'eof-in-comment'
1262 $this->emitToken($this->token);
1263 $this->stream->unget();
1264 $state = 'data';
1265 } else {
1266 /* Anything else
1267 Append the input character to the comment token's
1268 data. Switch to the comment state. */
1269 $this->token['data'] .= $char;
1270 $state = 'comment';
1272 break;
1274 case 'comment start dash':
1275 /* Consume the next input character: */
1276 $char = $this->stream->char();
1277 if ($char === '-') {
1278 /* U+002D HYPHEN-MINUS (-)
1279 Switch to the comment end state */
1280 $state = 'comment end';
1281 } elseif ($char === '>') {
1282 /* U+003E GREATER-THAN SIGN (>)
1283 Parse error. Emit the comment token. Switch to the
1284 data state. */
1285 $this->emitToken(array(
1286 'type' => self::PARSEERROR,
1287 'data' => 'incorrect-comment'
1289 $this->emitToken($this->token);
1290 $state = 'data';
1291 } elseif ($char === false) {
1292 /* Parse error. Emit the comment token. Reconsume the
1293 EOF character in the data state. */
1294 $this->emitToken(array(
1295 'type' => self::PARSEERROR,
1296 'data' => 'eof-in-comment'
1298 $this->emitToken($this->token);
1299 $this->stream->unget();
1300 $state = 'data';
1301 } else {
1302 $this->token['data'] .= '-' . $char;
1303 $state = 'comment';
1305 break;
1307 case 'comment':
1308 /* Consume the next input character: */
1309 $char = $this->stream->char();
1311 if ($char === '-') {
1312 /* U+002D HYPHEN-MINUS (-)
1313 Switch to the comment end dash state */
1314 $state = 'comment end dash';
1316 } elseif ($char === false) {
1317 /* EOF
1318 Parse error. Emit the comment token. Reconsume the EOF character
1319 in the data state. */
1320 $this->emitToken(array(
1321 'type' => self::PARSEERROR,
1322 'data' => 'eof-in-comment'
1324 $this->emitToken($this->token);
1325 $this->stream->unget();
1326 $state = 'data';
1328 } else {
1329 /* Anything else
1330 Append the input character to the comment token's data. Stay in
1331 the comment state. */
1332 $chars = $this->stream->charsUntil('-');
1334 $this->token['data'] .= $char . $chars;
1336 break;
1338 case 'comment end dash':
1339 /* Consume the next input character: */
1340 $char = $this->stream->char();
1342 if ($char === '-') {
1343 /* U+002D HYPHEN-MINUS (-)
1344 Switch to the comment end state */
1345 $state = 'comment end';
1347 } elseif ($char === false) {
1348 /* EOF
1349 Parse error. Emit the comment token. Reconsume the EOF character
1350 in the data state. */
1351 $this->emitToken(array(
1352 'type' => self::PARSEERROR,
1353 'data' => 'eof-in-comment-end-dash'
1355 $this->emitToken($this->token);
1356 $this->stream->unget();
1357 $state = 'data';
1359 } else {
1360 /* Anything else
1361 Append a U+002D HYPHEN-MINUS (-) character and the input
1362 character to the comment token's data. Switch to the comment state. */
1363 $this->token['data'] .= '-'.$char;
1364 $state = 'comment';
1366 break;
case 'comment end':
    // Reached after "--" has been seen inside a comment. Decides whether
    // the comment closes here or whether the dashes were comment content.
    /* Consume the next input character: */
    $char = $this->stream->char();

    if ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the comment token. Switch to the data state. */
        $this->emitToken($this->token);
        $state = 'data';

    } elseif ($char === '-') {
        /* U+002D HYPHEN-MINUS (-)
        Parse error. Append a U+002D HYPHEN-MINUS (-) character
        to the comment token's data. Stay in the comment end
        state. */
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'unexpected-dash-after-double-dash-in-comment'
        ));
        $this->token['data'] .= '-';

    } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
        // Whitespace after "--": tab, line feed, form feed or space.
        // The form feed test previously read "\x0a" (a second LF check),
        // so U+000C was never matched here and was handled by the
        // "anything else" branch instead. "\x0c" matches the whitespace
        // set used by every other state in this tokenizer.
        // The two pending dashes and the whitespace become comment data.
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'unexpected-space-after-double-dash-in-comment'
        ));
        $this->token['data'] .= '--' . $char;
        $state = 'comment end space';

    } elseif ($char === '!') {
        // "--!" : report it and let the comment end bang state decide
        // whether the comment closes; nothing is appended yet.
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'unexpected-bang-after-double-dash-in-comment'
        ));
        $state = 'comment end bang';

    } elseif ($char === false) {
        /* EOF
        Parse error. Emit the comment token. Reconsume the
        EOF character in the data state. */
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'eof-in-comment-double-dash'
        ));
        $this->emitToken($this->token);
        $this->stream->unget();
        $state = 'data';

    } else {
        /* Anything else
        Parse error. Append two U+002D HYPHEN-MINUS (-)
        characters and the input character to the comment token's
        data. Switch to the comment state. */
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'unexpected-char-in-comment'
        ));
        $this->token['data'] .= '--' . $char;
        $state = 'comment';
    }
    break;
1430 case 'comment end bang':
1431 $char = $this->stream->char();
1432 if ($char === '>') {
1433 $this->emitToken($this->token);
1434 $state = 'data';
1435 } elseif ($char === "-") {
1436 $this->token['data'] .= '--!';
1437 $state = 'comment end dash';
1438 } elseif ($char === false) {
1439 $this->emitToken(array(
1440 'type' => self::PARSEERROR,
1441 'data' => 'eof-in-comment-end-bang'
1443 $this->emitToken($this->token);
1444 $this->stream->unget();
1445 $state = 'data';
1446 } else {
1447 $this->token['data'] .= '--!' . $char;
1448 $state = 'comment';
1450 break;
1452 case 'comment end space':
1453 $char = $this->stream->char();
1454 if ($char === '>') {
1455 $this->emitToken($this->token);
1456 $state = 'data';
1457 } elseif ($char === '-') {
1458 $state = 'comment end dash';
1459 } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1460 $this->token['data'] .= $char;
1461 } elseif ($char === false) {
1462 $this->emitToken(array(
1463 'type' => self::PARSEERROR,
1464 'data' => 'unexpected-eof-in-comment-end-space',
1466 $this->emitToken($this->token);
1467 $this->stream->unget();
1468 $state = 'data';
1469 } else {
1470 $this->token['data'] .= $char;
1471 $state = 'comment';
1473 break;
1475 case 'DOCTYPE':
1476 /* Consume the next input character: */
1477 $char = $this->stream->char();
1479 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1480 /* U+0009 CHARACTER TABULATION
1481 U+000A LINE FEED (LF)
1482 U+000C FORM FEED (FF)
1483 U+0020 SPACE
1484 Switch to the before DOCTYPE name state. */
1485 $state = 'before DOCTYPE name';
1487 } elseif ($char === false) {
1488 /* EOF
1489 Parse error. Create a new DOCTYPE token. Set its
1490 force-quirks flag to on. Emit the token. Reconsume the
1491 EOF character in the data state. */
1492 $this->emitToken(array(
1493 'type' => self::PARSEERROR,
1494 'data' => 'need-space-after-doctype-but-got-eof'
1496 $this->emitToken(array(
1497 'name' => '',
1498 'type' => self::DOCTYPE,
1499 'force-quirks' => true,
1500 'error' => true
1502 $this->stream->unget();
1503 $state = 'data';
1505 } else {
1506 /* Anything else
1507 Parse error. Reconsume the current character in the
1508 before DOCTYPE name state. */
1509 $this->emitToken(array(
1510 'type' => self::PARSEERROR,
1511 'data' => 'need-space-after-doctype'
1513 $this->stream->unget();
1514 $state = 'before DOCTYPE name';
1516 break;
1518 case 'before DOCTYPE name':
1519 /* Consume the next input character: */
1520 $char = $this->stream->char();
1522 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1523 /* U+0009 CHARACTER TABULATION
1524 U+000A LINE FEED (LF)
1525 U+000C FORM FEED (FF)
1526 U+0020 SPACE
1527 Stay in the before DOCTYPE name state. */
1529 } elseif ($char === '>') {
1530 /* U+003E GREATER-THAN SIGN (>)
1531 Parse error. Create a new DOCTYPE token. Set its
1532 force-quirks flag to on. Emit the token. Switch to the
1533 data state. */
1534 $this->emitToken(array(
1535 'type' => self::PARSEERROR,
1536 'data' => 'expected-doctype-name-but-got-right-bracket'
1538 $this->emitToken(array(
1539 'name' => '',
1540 'type' => self::DOCTYPE,
1541 'force-quirks' => true,
1542 'error' => true
1545 $state = 'data';
1547 } elseif ('A' <= $char && $char <= 'Z') {
1548 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1549 Create a new DOCTYPE token. Set the token's name to the
1550 lowercase version of the input character (add 0x0020 to
1551 the character's code point). Switch to the DOCTYPE name
1552 state. */
1553 $this->token = array(
1554 'name' => strtolower($char),
1555 'type' => self::DOCTYPE,
1556 'error' => true
1559 $state = 'DOCTYPE name';
1561 } elseif ($char === false) {
1562 /* EOF
1563 Parse error. Create a new DOCTYPE token. Set its
1564 force-quirks flag to on. Emit the token. Reconsume the
1565 EOF character in the data state. */
1566 $this->emitToken(array(
1567 'type' => self::PARSEERROR,
1568 'data' => 'expected-doctype-name-but-got-eof'
1570 $this->emitToken(array(
1571 'name' => '',
1572 'type' => self::DOCTYPE,
1573 'force-quirks' => true,
1574 'error' => true
1577 $this->stream->unget();
1578 $state = 'data';
1580 } else {
1581 /* Anything else
1582 Create a new DOCTYPE token. Set the token's name to the
1583 current input character. Switch to the DOCTYPE name state. */
1584 $this->token = array(
1585 'name' => $char,
1586 'type' => self::DOCTYPE,
1587 'error' => true
1590 $state = 'DOCTYPE name';
1592 break;
1594 case 'DOCTYPE name':
1595 /* Consume the next input character: */
1596 $char = $this->stream->char();
1598 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1599 /* U+0009 CHARACTER TABULATION
1600 U+000A LINE FEED (LF)
1601 U+000C FORM FEED (FF)
1602 U+0020 SPACE
1603 Switch to the after DOCTYPE name state. */
1604 $state = 'after DOCTYPE name';
1606 } elseif ($char === '>') {
1607 /* U+003E GREATER-THAN SIGN (>)
1608 Emit the current DOCTYPE token. Switch to the data state. */
1609 $this->emitToken($this->token);
1610 $state = 'data';
1612 } elseif ('A' <= $char && $char <= 'Z') {
1613 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1614 Append the lowercase version of the input character
1615 (add 0x0020 to the character's code point) to the current
1616 DOCTYPE token's name. Stay in the DOCTYPE name state. */
1617 $this->token['name'] .= strtolower($char);
1619 } elseif ($char === false) {
1620 /* EOF
1621 Parse error. Set the DOCTYPE token's force-quirks flag
1622 to on. Emit that DOCTYPE token. Reconsume the EOF
1623 character in the data state. */
1624 $this->emitToken(array(
1625 'type' => self::PARSEERROR,
1626 'data' => 'eof-in-doctype-name'
1628 $this->token['force-quirks'] = true;
1629 $this->emitToken($this->token);
1630 $this->stream->unget();
1631 $state = 'data';
1633 } else {
1634 /* Anything else
1635 Append the current input character to the current
1636 DOCTYPE token's name. Stay in the DOCTYPE name state. */
1637 $this->token['name'] .= $char;
1640 // XXX this is probably some sort of quirks mode designation,
1641 // check tree-builder to be sure. In general 'error' needs
1642 // to be specc'ified, this probably means removing it at the end
1643 $this->token['error'] = ($this->token['name'] === 'HTML')
1644 ? false
1645 : true;
1646 break;
1648 case 'after DOCTYPE name':
1649 /* Consume the next input character: */
1650 $char = $this->stream->char();
1652 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1653 /* U+0009 CHARACTER TABULATION
1654 U+000A LINE FEED (LF)
1655 U+000C FORM FEED (FF)
1656 U+0020 SPACE
1657 Stay in the after DOCTYPE name state. */
1659 } elseif ($char === '>') {
1660 /* U+003E GREATER-THAN SIGN (>)
1661 Emit the current DOCTYPE token. Switch to the data state. */
1662 $this->emitToken($this->token);
1663 $state = 'data';
1665 } elseif ($char === false) {
1666 /* EOF
1667 Parse error. Set the DOCTYPE token's force-quirks flag
1668 to on. Emit that DOCTYPE token. Reconsume the EOF
1669 character in the data state. */
1670 $this->emitToken(array(
1671 'type' => self::PARSEERROR,
1672 'data' => 'eof-in-doctype'
1674 $this->token['force-quirks'] = true;
1675 $this->emitToken($this->token);
1676 $this->stream->unget();
1677 $state = 'data';
1679 } else {
1680 /* Anything else */
1682 $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1683 if ($nextSix === 'PUBLIC') {
1684 /* If the next six characters are an ASCII
1685 case-insensitive match for the word "PUBLIC", then
1686 consume those characters and switch to the before
1687 DOCTYPE public identifier state. */
1688 $state = 'before DOCTYPE public identifier';
1690 } elseif ($nextSix === 'SYSTEM') {
1691 /* Otherwise, if the next six characters are an ASCII
1692 case-insensitive match for the word "SYSTEM", then
1693 consume those characters and switch to the before
1694 DOCTYPE system identifier state. */
1695 $state = 'before DOCTYPE system identifier';
1697 } else {
1698 /* Otherwise, this is the parse error. Set the DOCTYPE
1699 token's force-quirks flag to on. Switch to the bogus
1700 DOCTYPE state. */
1701 $this->emitToken(array(
1702 'type' => self::PARSEERROR,
1703 'data' => 'expected-space-or-right-bracket-in-doctype'
1705 $this->token['force-quirks'] = true;
1706 $this->token['error'] = true;
1707 $state = 'bogus DOCTYPE';
1710 break;
1712 case 'before DOCTYPE public identifier':
1713 /* Consume the next input character: */
1714 $char = $this->stream->char();
1716 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1717 /* U+0009 CHARACTER TABULATION
1718 U+000A LINE FEED (LF)
1719 U+000C FORM FEED (FF)
1720 U+0020 SPACE
1721 Stay in the before DOCTYPE public identifier state. */
1722 } elseif ($char === '"') {
1723 /* U+0022 QUOTATION MARK (")
1724 Set the DOCTYPE token's public identifier to the empty
1725 string (not missing), then switch to the DOCTYPE public
1726 identifier (double-quoted) state. */
1727 $this->token['public'] = '';
1728 $state = 'DOCTYPE public identifier (double-quoted)';
1729 } elseif ($char === "'") {
1730 /* U+0027 APOSTROPHE (')
1731 Set the DOCTYPE token's public identifier to the empty
1732 string (not missing), then switch to the DOCTYPE public
1733 identifier (single-quoted) state. */
1734 $this->token['public'] = '';
1735 $state = 'DOCTYPE public identifier (single-quoted)';
1736 } elseif ($char === '>') {
1737 /* Parse error. Set the DOCTYPE token's force-quirks flag
1738 to on. Emit that DOCTYPE token. Switch to the data state. */
1739 $this->emitToken(array(
1740 'type' => self::PARSEERROR,
1741 'data' => 'unexpected-end-of-doctype'
1743 $this->token['force-quirks'] = true;
1744 $this->emitToken($this->token);
1745 $state = 'data';
1746 } elseif ($char === false) {
1747 /* Parse error. Set the DOCTYPE token's force-quirks
1748 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1749 character in the data state. */
1750 $this->emitToken(array(
1751 'type' => self::PARSEERROR,
1752 'data' => 'eof-in-doctype'
1754 $this->token['force-quirks'] = true;
1755 $this->emitToken($this->token);
1756 $this->stream->unget();
1757 $state = 'data';
1758 } else {
1759 /* Parse error. Set the DOCTYPE token's force-quirks flag
1760 to on. Switch to the bogus DOCTYPE state. */
1761 $this->emitToken(array(
1762 'type' => self::PARSEERROR,
1763 'data' => 'unexpected-char-in-doctype'
1765 $this->token['force-quirks'] = true;
1766 $state = 'bogus DOCTYPE';
1768 break;
1770 case 'DOCTYPE public identifier (double-quoted)':
1771 /* Consume the next input character: */
1772 $char = $this->stream->char();
1774 if ($char === '"') {
1775 /* U+0022 QUOTATION MARK (")
1776 Switch to the after DOCTYPE public identifier state. */
1777 $state = 'after DOCTYPE public identifier';
1778 } elseif ($char === '>') {
1779 /* U+003E GREATER-THAN SIGN (>)
1780 Parse error. Set the DOCTYPE token's force-quirks flag
1781 to on. Emit that DOCTYPE token. Switch to the data state. */
1782 $this->emitToken(array(
1783 'type' => self::PARSEERROR,
1784 'data' => 'unexpected-end-of-doctype'
1786 $this->token['force-quirks'] = true;
1787 $this->emitToken($this->token);
1788 $state = 'data';
1789 } elseif ($char === false) {
1790 /* EOF
1791 Parse error. Set the DOCTYPE token's force-quirks flag
1792 to on. Emit that DOCTYPE token. Reconsume the EOF
1793 character in the data state. */
1794 $this->emitToken(array(
1795 'type' => self::PARSEERROR,
1796 'data' => 'eof-in-doctype'
1798 $this->token['force-quirks'] = true;
1799 $this->emitToken($this->token);
1800 $this->stream->unget();
1801 $state = 'data';
1802 } else {
1803 /* Anything else
1804 Append the current input character to the current
1805 DOCTYPE token's public identifier. Stay in the DOCTYPE
1806 public identifier (double-quoted) state. */
1807 $this->token['public'] .= $char;
1809 break;
1811 case 'DOCTYPE public identifier (single-quoted)':
1812 /* Consume the next input character: */
1813 $char = $this->stream->char();
1815 if ($char === "'") {
1816 /* U+0027 APOSTROPHE (')
1817 Switch to the after DOCTYPE public identifier state. */
1818 $state = 'after DOCTYPE public identifier';
1819 } elseif ($char === '>') {
1820 /* U+003E GREATER-THAN SIGN (>)
1821 Parse error. Set the DOCTYPE token's force-quirks flag
1822 to on. Emit that DOCTYPE token. Switch to the data state. */
1823 $this->emitToken(array(
1824 'type' => self::PARSEERROR,
1825 'data' => 'unexpected-end-of-doctype'
1827 $this->token['force-quirks'] = true;
1828 $this->emitToken($this->token);
1829 $state = 'data';
1830 } elseif ($char === false) {
1831 /* EOF
1832 Parse error. Set the DOCTYPE token's force-quirks flag
1833 to on. Emit that DOCTYPE token. Reconsume the EOF
1834 character in the data state. */
1835 $this->emitToken(array(
1836 'type' => self::PARSEERROR,
1837 'data' => 'eof-in-doctype'
1839 $this->token['force-quirks'] = true;
1840 $this->emitToken($this->token);
1841 $this->stream->unget();
1842 $state = 'data';
1843 } else {
1844 /* Anything else
1845 Append the current input character to the current
1846 DOCTYPE token's public identifier. Stay in the DOCTYPE
1847 public identifier (single-quoted) state. */
1848 $this->token['public'] .= $char;
1850 break;
1852 case 'after DOCTYPE public identifier':
1853 /* Consume the next input character: */
1854 $char = $this->stream->char();
1856 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1857 /* U+0009 CHARACTER TABULATION
1858 U+000A LINE FEED (LF)
1859 U+000C FORM FEED (FF)
1860 U+0020 SPACE
1861 Stay in the after DOCTYPE public identifier state. */
1862 } elseif ($char === '"') {
1863 /* U+0022 QUOTATION MARK (")
1864 Set the DOCTYPE token's system identifier to the
1865 empty string (not missing), then switch to the DOCTYPE
1866 system identifier (double-quoted) state. */
1867 $this->token['system'] = '';
1868 $state = 'DOCTYPE system identifier (double-quoted)';
1869 } elseif ($char === "'") {
1870 /* U+0027 APOSTROPHE (')
1871 Set the DOCTYPE token's system identifier to the
1872 empty string (not missing), then switch to the DOCTYPE
1873 system identifier (single-quoted) state. */
1874 $this->token['system'] = '';
1875 $state = 'DOCTYPE system identifier (single-quoted)';
1876 } elseif ($char === '>') {
1877 /* U+003E GREATER-THAN SIGN (>)
1878 Emit the current DOCTYPE token. Switch to the data state. */
1879 $this->emitToken($this->token);
1880 $state = 'data';
1881 } elseif ($char === false) {
1882 /* Parse error. Set the DOCTYPE token's force-quirks
1883 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1884 character in the data state. */
1885 $this->emitToken(array(
1886 'type' => self::PARSEERROR,
1887 'data' => 'eof-in-doctype'
1889 $this->token['force-quirks'] = true;
1890 $this->emitToken($this->token);
1891 $this->stream->unget();
1892 $state = 'data';
1893 } else {
1894 /* Anything else
1895 Parse error. Set the DOCTYPE token's force-quirks flag
1896 to on. Switch to the bogus DOCTYPE state. */
1897 $this->emitToken(array(
1898 'type' => self::PARSEERROR,
1899 'data' => 'unexpected-char-in-doctype'
1901 $this->token['force-quirks'] = true;
1902 $state = 'bogus DOCTYPE';
1904 break;
1906 case 'before DOCTYPE system identifier':
1907 /* Consume the next input character: */
1908 $char = $this->stream->char();
1910 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1911 /* U+0009 CHARACTER TABULATION
1912 U+000A LINE FEED (LF)
1913 U+000C FORM FEED (FF)
1914 U+0020 SPACE
1915 Stay in the before DOCTYPE system identifier state. */
1916 } elseif ($char === '"') {
1917 /* U+0022 QUOTATION MARK (")
1918 Set the DOCTYPE token's system identifier to the empty
1919 string (not missing), then switch to the DOCTYPE system
1920 identifier (double-quoted) state. */
1921 $this->token['system'] = '';
1922 $state = 'DOCTYPE system identifier (double-quoted)';
1923 } elseif ($char === "'") {
1924 /* U+0027 APOSTROPHE (')
1925 Set the DOCTYPE token's system identifier to the empty
1926 string (not missing), then switch to the DOCTYPE system
1927 identifier (single-quoted) state. */
1928 $this->token['system'] = '';
1929 $state = 'DOCTYPE system identifier (single-quoted)';
1930 } elseif ($char === '>') {
1931 /* Parse error. Set the DOCTYPE token's force-quirks flag
1932 to on. Emit that DOCTYPE token. Switch to the data state. */
1933 $this->emitToken(array(
1934 'type' => self::PARSEERROR,
1935 'data' => 'unexpected-char-in-doctype'
1937 $this->token['force-quirks'] = true;
1938 $this->emitToken($this->token);
1939 $state = 'data';
1940 } elseif ($char === false) {
1941 /* Parse error. Set the DOCTYPE token's force-quirks
1942 flag to on. Emit that DOCTYPE token. Reconsume the EOF
1943 character in the data state. */
1944 $this->emitToken(array(
1945 'type' => self::PARSEERROR,
1946 'data' => 'eof-in-doctype'
1948 $this->token['force-quirks'] = true;
1949 $this->emitToken($this->token);
1950 $this->stream->unget();
1951 $state = 'data';
1952 } else {
1953 /* Parse error. Set the DOCTYPE token's force-quirks flag
1954 to on. Switch to the bogus DOCTYPE state. */
1955 $this->emitToken(array(
1956 'type' => self::PARSEERROR,
1957 'data' => 'unexpected-char-in-doctype'
1959 $this->token['force-quirks'] = true;
1960 $state = 'bogus DOCTYPE';
1962 break;
1964 case 'DOCTYPE system identifier (double-quoted)':
1965 /* Consume the next input character: */
1966 $char = $this->stream->char();
1968 if ($char === '"') {
1969 /* U+0022 QUOTATION MARK (")
1970 Switch to the after DOCTYPE system identifier state. */
1971 $state = 'after DOCTYPE system identifier';
1972 } elseif ($char === '>') {
1973 /* U+003E GREATER-THAN SIGN (>)
1974 Parse error. Set the DOCTYPE token's force-quirks flag
1975 to on. Emit that DOCTYPE token. Switch to the data state. */
1976 $this->emitToken(array(
1977 'type' => self::PARSEERROR,
1978 'data' => 'unexpected-end-of-doctype'
1980 $this->token['force-quirks'] = true;
1981 $this->emitToken($this->token);
1982 $state = 'data';
1983 } elseif ($char === false) {
1984 /* EOF
1985 Parse error. Set the DOCTYPE token's force-quirks flag
1986 to on. Emit that DOCTYPE token. Reconsume the EOF
1987 character in the data state. */
1988 $this->emitToken(array(
1989 'type' => self::PARSEERROR,
1990 'data' => 'eof-in-doctype'
1992 $this->token['force-quirks'] = true;
1993 $this->emitToken($this->token);
1994 $this->stream->unget();
1995 $state = 'data';
1996 } else {
1997 /* Anything else
1998 Append the current input character to the current
1999 DOCTYPE token's system identifier. Stay in the DOCTYPE
2000 system identifier (double-quoted) state. */
2001 $this->token['system'] .= $char;
2003 break;
2005 case 'DOCTYPE system identifier (single-quoted)':
2006 /* Consume the next input character: */
2007 $char = $this->stream->char();
2009 if ($char === "'") {
2010 /* U+0027 APOSTROPHE (')
2011 Switch to the after DOCTYPE system identifier state. */
2012 $state = 'after DOCTYPE system identifier';
2013 } elseif ($char === '>') {
2014 /* U+003E GREATER-THAN SIGN (>)
2015 Parse error. Set the DOCTYPE token's force-quirks flag
2016 to on. Emit that DOCTYPE token. Switch to the data state. */
2017 $this->emitToken(array(
2018 'type' => self::PARSEERROR,
2019 'data' => 'unexpected-end-of-doctype'
2021 $this->token['force-quirks'] = true;
2022 $this->emitToken($this->token);
2023 $state = 'data';
2024 } elseif ($char === false) {
2025 /* EOF
2026 Parse error. Set the DOCTYPE token's force-quirks flag
2027 to on. Emit that DOCTYPE token. Reconsume the EOF
2028 character in the data state. */
2029 $this->emitToken(array(
2030 'type' => self::PARSEERROR,
2031 'data' => 'eof-in-doctype'
2033 $this->token['force-quirks'] = true;
2034 $this->emitToken($this->token);
2035 $this->stream->unget();
2036 $state = 'data';
2037 } else {
2038 /* Anything else
2039 Append the current input character to the current
2040 DOCTYPE token's system identifier. Stay in the DOCTYPE
2041 system identifier (single-quoted) state. */
2042 $this->token['system'] .= $char;
2044 break;
2046 case 'after DOCTYPE system identifier':
2047 /* Consume the next input character: */
2048 $char = $this->stream->char();
2050 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
2051 /* U+0009 CHARACTER TABULATION
2052 U+000A LINE FEED (LF)
2053 U+000C FORM FEED (FF)
2054 U+0020 SPACE
2055 Stay in the after DOCTYPE system identifier state. */
2056 } elseif ($char === '>') {
2057 /* U+003E GREATER-THAN SIGN (>)
2058 Emit the current DOCTYPE token. Switch to the data state. */
2059 $this->emitToken($this->token);
2060 $state = 'data';
2061 } elseif ($char === false) {
2062 /* Parse error. Set the DOCTYPE token's force-quirks
2063 flag to on. Emit that DOCTYPE token. Reconsume the EOF
2064 character in the data state. */
2065 $this->emitToken(array(
2066 'type' => self::PARSEERROR,
2067 'data' => 'eof-in-doctype'
2069 $this->token['force-quirks'] = true;
2070 $this->emitToken($this->token);
2071 $this->stream->unget();
2072 $state = 'data';
2073 } else {
2074 /* Anything else
2075 Parse error. Switch to the bogus DOCTYPE state.
2076 (This does not set the DOCTYPE token's force-quirks
2077 flag to on.) */
2078 $this->emitToken(array(
2079 'type' => self::PARSEERROR,
2080 'data' => 'unexpected-char-in-doctype'
2082 $state = 'bogus DOCTYPE';
2084 break;
2086 case 'bogus DOCTYPE':
2087 /* Consume the next input character: */
2088 $char = $this->stream->char();
2090 if ($char === '>') {
2091 /* U+003E GREATER-THAN SIGN (>)
2092 Emit the DOCTYPE token. Switch to the data state. */
2093 $this->emitToken($this->token);
2094 $state = 'data';
2096 } elseif ($char === false) {
2097 /* EOF
2098 Emit the DOCTYPE token. Reconsume the EOF character in
2099 the data state. */
2100 $this->emitToken($this->token);
2101 $this->stream->unget();
2102 $state = 'data';
2104 } else {
2105 /* Anything else
2106 Stay in the bogus DOCTYPE state. */
2108 break;
2110 // case 'cdataSection':
/**
 * Hands back whatever the tree builder's own save() produces, i.e. the
 * serialized form of the tree built so far.
 *
 * @return DOMDocument|DOMNodeList
 */
public function save() {
    $serialized = $this->tree->save();

    return $serialized;
}
/**
 * Accessor for the tree builder held by this tokenizer.
 *
 * @return HTML5_TreeBuilder The tree
 */
public function getTree()
{
    $builder = $this->tree;

    return $builder;
}
/**
 * Accessor for the input stream this tokenizer reads from.
 *
 * @return HTML5_InputStream
 */
public function stream() {
    $input = $this->stream;

    return $input;
}
/**
 * Attempts to consume a character reference from the input stream.
 *
 * Reading starts at the character immediately after the U+0026 AMPERSAND.
 * When no reference can be matched, the text that was consumed is handed
 * back prefixed with '&' so the caller can treat it as literal text.
 *
 * @param string|bool $allowed The additional allowed character, or false
 *                             when there is none.
 * @param bool $inattr         True when the reference is being consumed as
 *                             part of an attribute value.
 * @return string The decoded character (plus any trailing consumed text),
 *                or '&' followed by the consumed text when nothing matched.
 */
private function consumeCharacterReference($allowed = false, $inattr = false) {
    // This goes quite far against spec, and is far closer to the Python
    // impl., mainly because we don't do the large unconsuming the spec
    // requires.

    // All consumed characters.
    $chars = $this->stream->char();

    /* This section defines how to consume a character
    reference. This definition is used when parsing character
    references in text and in attributes.

    The behavior depends on the identity of the next character
    (the one immediately after the U+0026 AMPERSAND character): */

    // EOF is tested first: indexing into false ($chars[0]) raises an
    // "array offset on value of type bool" diagnostic on PHP >= 7.4.
    if (
        $chars === false ||
        $chars[0] === "\x09" ||
        $chars[0] === "\x0A" ||
        $chars[0] === "\x0C" ||
        $chars[0] === "\x20" ||
        $chars[0] === '<' ||
        $chars[0] === '&' ||
        $chars[0] === $allowed
    ) {
        /* U+0009 CHARACTER TABULATION
           U+000A LINE FEED (LF)
           U+000C FORM FEED (FF)
           U+0020 SPACE
           U+003C LESS-THAN SIGN
           U+0026 AMPERSAND
           EOF
           The additional allowed character, if there is one
        Not a character reference. No characters are consumed,
        and nothing is returned. (This is not an error, either.) */
        // We already consumed, so unconsume.
        $this->stream->unget();
        return '&';
    } elseif ($chars[0] === '#') {
        /* Consume the U+0023 NUMBER SIGN. */
        // Um, yeah, we already did that.
        /* The behavior further depends on the character after
        the U+0023 NUMBER SIGN: */
        $chars .= $this->stream->char();
        if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
            /* U+0078 LATIN SMALL LETTER X
               U+0058 LATIN CAPITAL LETTER X */
            /* Consume the X. */
            // Um, yeah, we already did that.
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE, U+0061 LATIN SMALL LETTER A through to U+0066
            LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
            A, through to U+0046 LATIN CAPITAL LETTER F (in other
            words, 0123456789, ABCDEF, abcdef). */
            $char_class = self::HEX;
            /* When it comes to interpreting the
            number, interpret it as a hexadecimal number. */
            $hex = true;
        } else {
            /* Anything else */
            // Unconsume because we shouldn't have consumed this.
            $chars = $chars[0];
            $this->stream->unget();
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE (i.e. just 0123456789). */
            $char_class = self::DIGIT;
            /* When it comes to interpreting the number,
            interpret it as a decimal number. */
            $hex = false;
        }

        /* Consume as many characters as match the range of characters given above. */
        $consumed = $this->stream->charsWhile($char_class);
        if ($consumed === '' || $consumed === false) {
            /* If no characters match the range, then don't consume
            any characters (and unconsume the U+0023 NUMBER SIGN
            character and, if appropriate, the X character). This
            is a parse error; nothing is returned. */
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-numeric-entity'
            ));
            return '&' . $chars;
        } else {
            /* Otherwise, if the next character is a U+003B SEMICOLON,
            consume that too. If it isn't, there is a parse error. */
            if ($this->stream->char() !== ';') {
                $this->stream->unget();
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'numeric-entity-without-semicolon'
                ));
            }

            /* If one or more characters match the range, then take
            them all and interpret the string of characters as a number
            (either hexadecimal or decimal as appropriate). */
            $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

            /* If that number is one of the numbers in the first column
            of the following table, then this is a parse error. Find the
            row with that number in the first column, and return a
            character token for the Unicode character given in the
            second column of that row. */
            $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
            if ($new_codepoint) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'illegal-windows-1252-entity'
                ));
                return HTML5_Data::utf8chr($new_codepoint);
            } else {
                /* Otherwise, if the number is greater than 0x10FFFF, then
                 * this is a parse error. Return a U+FFFD REPLACEMENT
                 * CHARACTER. */
                if ($codepoint > 0x10FFFF) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'overlong-character-entity' // XXX probably not correct
                    ));
                    return "\xEF\xBF\xBD";
                }
                /* Otherwise, return a character token for the Unicode
                 * character whose code point is that number.  If the
                 * number is in the range 0x0001 to 0x0008, 0x000E to
                 * 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
                 * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                 * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
                 * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                 * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
                 * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                 * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
                 * or 0x10FFFF, then this is a parse error. */
                // && has higher precedence than ||
                if (
                    $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
                    $codepoint === 0x000B ||
                    $codepoint >= 0x000E && $codepoint <= 0x001F ||
                    $codepoint >= 0x007F && $codepoint <= 0x009F ||
                    $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
                    $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
                    ($codepoint & 0xFFFE) === 0xFFFE ||
                    $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
                ) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'illegal-codepoint-for-numeric-entity'
                    ));
                }
                return HTML5_Data::utf8chr($codepoint);
            }
        }
    } else {
        /* Anything else */

        /* Consume the maximum number of characters possible,
        with the consumed characters matching one of the
        identifiers in the first column of the named character
        references table (in a case-sensitive manner). */
        // What we actually do here is consume as much as we can while it
        // matches the start of one of the identifiers in the first column.

        $refs = HTML5_Data::getNamedCharacterReferences();

        // Get the longest string which is the start of an identifier
        // ($chars) as well as the longest identifier which matches ($id)
        // and its codepoint ($codepoint).
        $codepoint = false;
        $char = $chars;
        while ($char !== false && isset($refs[$char])) {
            $refs = $refs[$char];
            if (isset($refs['codepoint'])) {
                $id = $chars;
                $codepoint = $refs['codepoint'];
            }
            $chars .= $char = $this->stream->char();
        }

        // Unconsume the one character we just took which caused the while
        // statement to fail. This could be anything and could cause state
        // changes (as if it matches the while loop it must be
        // alphanumeric so we can just concat it to whatever we get later).
        $this->stream->unget();
        if ($char !== false) {
            $chars = substr($chars, 0, -1);
        }

        /* If no match can be made, then this is a parse error.
        No characters are consumed, and nothing is returned. */
        if (!$codepoint) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-named-entity'
            ));
            return '&' . $chars;
        }

        /* If the last character matched is not a U+003B SEMICOLON
        (;), there is a parse error. */
        $semicolon = true;
        if (substr($id, -1) !== ';') {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'named-entity-without-semicolon'
            ));
            $semicolon = false;
        }

        /* If the character reference is being consumed as part of
        an attribute, and the last character matched is not a
        U+003B SEMICOLON (;), and the next character is in the
        range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
        LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
        or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
        then, for historical reasons, all the characters that were
        matched after the U+0026 AMPERSAND (&) must be unconsumed,
        and nothing is returned. */
        if ($inattr && !$semicolon) {
            // The next character is either the next character in $chars or in the stream.
            if (strlen($chars) > strlen($id)) {
                $next = substr($chars, strlen($id), 1);
            } else {
                $next = $this->stream->char();
                $this->stream->unget();
            }
            // $next is false at EOF. Comparing false against '0'/'9' with
            // <= converts both sides to bool and would wrongly succeed, so
            // require an actual ASCII alphanumeric character instead.
            if (is_string($next) && preg_match('/^[0-9A-Za-z]$/D', $next) === 1) {
                return '&' . $chars;
            }
        }

        /* Otherwise, return a character token for the character
        corresponding to the character reference name (as given
        by the second column of the named character references table). */
        return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
    }
}
/**
 * Consumes a character reference found inside an attribute value and
 * appends the outcome to the value of the last entry in the current
 * token's 'attr' list.
 *
 * @param string|bool $allowed The additional allowed character forwarded
 *                             to consumeCharacterReference(), or false.
 */
private function characterReferenceInAttributeValue($allowed = false) {
    /* Attempt to consume a character reference. */
    $entity = $this->consumeCharacterReference($allowed, true);

    /* If nothing is returned, append a U+0026 AMPERSAND
    character to the current attribute's value.

    Otherwise, append the returned character token to the
    current attribute's value. */
    // Test for "nothing" explicitly: a loose !$entity check also matches
    // the string "0", which turned &#48; / &#x30; into a literal '&'.
    $char = ($entity === false || $entity === null || $entity === '')
        ? '&'
        : $entity;

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;

    /* Finally, switch back to the attribute value state that you
    were in when were switched into this state. */
}
2414 * Emits a token, passing it on to the tree builder.
2416 * @param $token
2417 * @param bool $checkStream
2418 * @param bool $dry
2420 protected function emitToken($token, $checkStream = true, $dry = false) {
2421 if ($checkStream === true) {
2422 // Emit errors from input stream.
2423 while ($this->stream->errors) {
2424 $this->emitToken(array_shift($this->stream->errors), false);
2427 if ($token['type'] === self::ENDTAG && !empty($token['attr'])) {
2428 for ($i = 0; $i < count($token['attr']); $i++) {
2429 $this->emitToken(array(
2430 'type' => self::PARSEERROR,
2431 'data' => 'attributes-in-end-tag'
2435 if ($token['type'] === self::ENDTAG && !empty($token['self-closing'])) {
2436 $this->emitToken(array(
2437 'type' => self::PARSEERROR,
2438 'data' => 'self-closing-flag-on-end-tag',
2441 if ($token['type'] === self::STARTTAG) {
2442 // This could be changed to actually pass the tree-builder a hash
2443 $hash = array();
2444 foreach ($token['attr'] as $keypair) {
2445 if (isset($hash[$keypair['name']])) {
2446 $this->emitToken(array(
2447 'type' => self::PARSEERROR,
2448 'data' => 'duplicate-attribute',
2450 } else {
2451 $hash[$keypair['name']] = $keypair['value'];
2456 if ($dry === false) {
2457 // the current structure of attributes is not a terribly good one
2458 $this->tree->emitToken($token);
2461 if ($dry === false && is_int($this->tree->content_model)) {
2462 $this->content_model = $this->tree->content_model;
2463 $this->tree->content_model = null;
2465 } elseif ($token['type'] === self::ENDTAG) {
2466 $this->content_model = self::PCDATA;