Don't truncate in DOMLex when seeing closing div
[htmlpurifier.git] / library / HTMLPurifier / Lexer / PH5P.php
blobff4fa218fba247a4329324ee7cc920f5f45437bd
1 <?php
/**
 * Experimental lexer built on Jeroen van der Meer's PH5P HTML5 parser
 * (the HTML5 class). PH5P lives in the HTML5 pseudo-namespace, which may
 * conflict with other code using those names.
 *
 * @note
 *    Recent changes to PHP's DOM extension have produced fatal error
 *    conditions in the original PH5P. Until that is resolved, this lexer
 *    hands the input to DirectLex whenever a DOMException is thrown.
 */
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
{
    /**
     * Tokenizes HTML by parsing it with PH5P and walking the resulting DOM.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        // Normalize first, then wrap; both helpers are inherited.
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context
        );

        try {
            $html5 = new HTML5($prepared);
            $document = $html5->save();
        } catch (DOMException $error) {
            // PH5P failed: remember the error under 'PH5PError' so it can be
            // detected later, and let DirectLex process the ORIGINAL input.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $error);
            return $fallback->tokenizeHTML($html, $config, $context);
        }

        // Tokens are generated from the first <body> inside the first <html>.
        $root = $document->getElementsByTagName('html')->item(0);
        $body = $root->getElementsByTagName('body')->item(0);

        $result = array();
        $this->tokenizeDOM($body, $result);
        return $result;
    }
}
47 Copyright 2007 Jeroen van der Meer <http://jero.net/>
49 Permission is hereby granted, free of charge, to any person obtaining a
50 copy of this software and associated documentation files (the
51 "Software"), to deal in the Software without restriction, including
52 without limitation the rights to use, copy, modify, merge, publish,
53 distribute, sublicense, and/or sell copies of the Software, and to
54 permit persons to whom the Software is furnished to do so, subject to
55 the following conditions:
57 The above copyright notice and this permission notice shall be included
58 in all copies or substantial portions of the Software.
60 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
61 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
62 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
63 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
64 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
65 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
66 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
70 class HTML5
72 private $data;
73 private $char;
74 private $EOF;
75 private $state;
76 private $tree;
77 private $token;
78 private $content_model;
79 private $escape = false;
80 private $entities = array(
81 'AElig;',
82 'AElig',
83 'AMP;',
84 'AMP',
85 'Aacute;',
86 'Aacute',
87 'Acirc;',
88 'Acirc',
89 'Agrave;',
90 'Agrave',
91 'Alpha;',
92 'Aring;',
93 'Aring',
94 'Atilde;',
95 'Atilde',
96 'Auml;',
97 'Auml',
98 'Beta;',
99 'COPY;',
100 'COPY',
101 'Ccedil;',
102 'Ccedil',
103 'Chi;',
104 'Dagger;',
105 'Delta;',
106 'ETH;',
107 'ETH',
108 'Eacute;',
109 'Eacute',
110 'Ecirc;',
111 'Ecirc',
112 'Egrave;',
113 'Egrave',
114 'Epsilon;',
115 'Eta;',
116 'Euml;',
117 'Euml',
118 'GT;',
119 'GT',
120 'Gamma;',
121 'Iacute;',
122 'Iacute',
123 'Icirc;',
124 'Icirc',
125 'Igrave;',
126 'Igrave',
127 'Iota;',
128 'Iuml;',
129 'Iuml',
130 'Kappa;',
131 'LT;',
132 'LT',
133 'Lambda;',
134 'Mu;',
135 'Ntilde;',
136 'Ntilde',
137 'Nu;',
138 'OElig;',
139 'Oacute;',
140 'Oacute',
141 'Ocirc;',
142 'Ocirc',
143 'Ograve;',
144 'Ograve',
145 'Omega;',
146 'Omicron;',
147 'Oslash;',
148 'Oslash',
149 'Otilde;',
150 'Otilde',
151 'Ouml;',
152 'Ouml',
153 'Phi;',
154 'Pi;',
155 'Prime;',
156 'Psi;',
157 'QUOT;',
158 'QUOT',
159 'REG;',
160 'REG',
161 'Rho;',
162 'Scaron;',
163 'Sigma;',
164 'THORN;',
165 'THORN',
166 'TRADE;',
167 'Tau;',
168 'Theta;',
169 'Uacute;',
170 'Uacute',
171 'Ucirc;',
172 'Ucirc',
173 'Ugrave;',
174 'Ugrave',
175 'Upsilon;',
176 'Uuml;',
177 'Uuml',
178 'Xi;',
179 'Yacute;',
180 'Yacute',
181 'Yuml;',
182 'Zeta;',
183 'aacute;',
184 'aacute',
185 'acirc;',
186 'acirc',
187 'acute;',
188 'acute',
189 'aelig;',
190 'aelig',
191 'agrave;',
192 'agrave',
193 'alefsym;',
194 'alpha;',
195 'amp;',
196 'amp',
197 'and;',
198 'ang;',
199 'apos;',
200 'aring;',
201 'aring',
202 'asymp;',
203 'atilde;',
204 'atilde',
205 'auml;',
206 'auml',
207 'bdquo;',
208 'beta;',
209 'brvbar;',
210 'brvbar',
211 'bull;',
212 'cap;',
213 'ccedil;',
214 'ccedil',
215 'cedil;',
216 'cedil',
217 'cent;',
218 'cent',
219 'chi;',
220 'circ;',
221 'clubs;',
222 'cong;',
223 'copy;',
224 'copy',
225 'crarr;',
226 'cup;',
227 'curren;',
228 'curren',
229 'dArr;',
230 'dagger;',
231 'darr;',
232 'deg;',
233 'deg',
234 'delta;',
235 'diams;',
236 'divide;',
237 'divide',
238 'eacute;',
239 'eacute',
240 'ecirc;',
241 'ecirc',
242 'egrave;',
243 'egrave',
244 'empty;',
245 'emsp;',
246 'ensp;',
247 'epsilon;',
248 'equiv;',
249 'eta;',
250 'eth;',
251 'eth',
252 'euml;',
253 'euml',
254 'euro;',
255 'exist;',
256 'fnof;',
257 'forall;',
258 'frac12;',
259 'frac12',
260 'frac14;',
261 'frac14',
262 'frac34;',
263 'frac34',
264 'frasl;',
265 'gamma;',
266 'ge;',
267 'gt;',
268 'gt',
269 'hArr;',
270 'harr;',
271 'hearts;',
272 'hellip;',
273 'iacute;',
274 'iacute',
275 'icirc;',
276 'icirc',
277 'iexcl;',
278 'iexcl',
279 'igrave;',
280 'igrave',
281 'image;',
282 'infin;',
283 'int;',
284 'iota;',
285 'iquest;',
286 'iquest',
287 'isin;',
288 'iuml;',
289 'iuml',
290 'kappa;',
291 'lArr;',
292 'lambda;',
293 'lang;',
294 'laquo;',
295 'laquo',
296 'larr;',
297 'lceil;',
298 'ldquo;',
299 'le;',
300 'lfloor;',
301 'lowast;',
302 'loz;',
303 'lrm;',
304 'lsaquo;',
305 'lsquo;',
306 'lt;',
307 'lt',
308 'macr;',
309 'macr',
310 'mdash;',
311 'micro;',
312 'micro',
313 'middot;',
314 'middot',
315 'minus;',
316 'mu;',
317 'nabla;',
318 'nbsp;',
319 'nbsp',
320 'ndash;',
321 'ne;',
322 'ni;',
323 'not;',
324 'not',
325 'notin;',
326 'nsub;',
327 'ntilde;',
328 'ntilde',
329 'nu;',
330 'oacute;',
331 'oacute',
332 'ocirc;',
333 'ocirc',
334 'oelig;',
335 'ograve;',
336 'ograve',
337 'oline;',
338 'omega;',
339 'omicron;',
340 'oplus;',
341 'or;',
342 'ordf;',
343 'ordf',
344 'ordm;',
345 'ordm',
346 'oslash;',
347 'oslash',
348 'otilde;',
349 'otilde',
350 'otimes;',
351 'ouml;',
352 'ouml',
353 'para;',
354 'para',
355 'part;',
356 'permil;',
357 'perp;',
358 'phi;',
359 'pi;',
360 'piv;',
361 'plusmn;',
362 'plusmn',
363 'pound;',
364 'pound',
365 'prime;',
366 'prod;',
367 'prop;',
368 'psi;',
369 'quot;',
370 'quot',
371 'rArr;',
372 'radic;',
373 'rang;',
374 'raquo;',
375 'raquo',
376 'rarr;',
377 'rceil;',
378 'rdquo;',
379 'real;',
380 'reg;',
381 'reg',
382 'rfloor;',
383 'rho;',
384 'rlm;',
385 'rsaquo;',
386 'rsquo;',
387 'sbquo;',
388 'scaron;',
389 'sdot;',
390 'sect;',
391 'sect',
392 'shy;',
393 'shy',
394 'sigma;',
395 'sigmaf;',
396 'sim;',
397 'spades;',
398 'sub;',
399 'sube;',
400 'sum;',
401 'sup1;',
402 'sup1',
403 'sup2;',
404 'sup2',
405 'sup3;',
406 'sup3',
407 'sup;',
408 'supe;',
409 'szlig;',
410 'szlig',
411 'tau;',
412 'there4;',
413 'theta;',
414 'thetasym;',
415 'thinsp;',
416 'thorn;',
417 'thorn',
418 'tilde;',
419 'times;',
420 'times',
421 'trade;',
422 'uArr;',
423 'uacute;',
424 'uacute',
425 'uarr;',
426 'ucirc;',
427 'ucirc',
428 'ugrave;',
429 'ugrave',
430 'uml;',
431 'uml',
432 'upsih;',
433 'upsilon;',
434 'uuml;',
435 'uuml',
436 'weierp;',
437 'xi;',
438 'yacute;',
439 'yacute',
440 'yen;',
441 'yen',
442 'yuml;',
443 'yuml',
444 'zeta;',
445 'zwj;',
446 'zwnj;'
449 const PCDATA = 0;
450 const RCDATA = 1;
451 const CDATA = 2;
452 const PLAINTEXT = 3;
454 const DOCTYPE = 0;
455 const STARTTAG = 1;
456 const ENDTAG = 2;
457 const COMMENT = 3;
458 const CHARACTR = 4;
459 const EOF = 5;
/**
 * Stores the input and immediately runs the tokenizer state machine over it.
 *
 * @param string $data Markup to tokenize.
 */
public function __construct($data)
{
    $this->tree = new HTML5TreeConstructer;
    $this->data = $data;
    $this->EOF = strlen($data);
    // The cursor starts one position before the input because the state
    // handlers advance it before reading.
    $this->char = -1;
    $this->content_model = self::PCDATA;
    $this->state = 'data';

    // A state named "foo" is handled by the method fooState(); keep
    // dispatching until a handler sets the state to null.
    while (null !== $this->state) {
        $handler = $this->state . 'State';
        $this->$handler();
    }
}
/**
 * Returns the result of the tree constructor's own save() call.
 *
 * @return mixed Whatever HTML5TreeConstructer::save() returns.
 */
public function save()
{
    $result = $this->tree->save();
    return $result;
}
/**
 * Reads the character under the cursor.
 *
 * @return string|bool The character at the current position, or false once
 *                     the cursor has reached the end of the input.
 */
private function char()
{
    if ($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
/**
 * Reads from an arbitrary position of the input without moving the cursor.
 *
 * @param int $s Offset to read from.
 * @param int $l Number of characters to read; 0 reads a single character.
 * @return string|null The character or substring, or null when $s + $l is
 *                     not strictly before the end of the input.
 */
private function character($s, $l = 0)
{
    // NOTE(review): because the bound is strict, a substring that ends
    // exactly at the end of the input is rejected as well.
    if (!($s + $l < $this->EOF)) {
        return null;
    }

    if ($l === 0) {
        return $this->data[$s];
    }

    return substr($this->data, $s, $l);
}
/**
 * Returns the longest run of characters, beginning at $start, that all
 * belong to the given character class.
 *
 * The previous implementation used preg_replace(), which hands back its
 * subject unchanged when the pattern does not match. As a result, input that
 * did not begin with a character of the class produced the whole remaining
 * input instead of an empty run.
 *
 * @param string $char_class Body of a PCRE character class, e.g. 'A-Za-z' or '^>'.
 * @param int $start Offset in the input at which the run must begin.
 * @return string The matching run; '' when the character at $start is not in
 *                the class or $start lies beyond the input.
 */
private function characters($char_class, $start)
{
    $rest = substr($this->data, $start);
    if (!is_string($rest) || $rest === '') {
        // Older PHP versions return false for an out-of-range offset.
        return '';
    }

    if (preg_match('#^[' . $char_class . ']+#', $rest, $found)) {
        return $found[0];
    }

    return '';
}
/**
 * Data state: consumes the next input character and either switches to a
 * markup state or emits character tokens.
 *
 * Fixes relative to the previous version:
 *  - The bulk "anything else" branch now always consumes at least one
 *    character. Before, a '&' in CDATA content (or a '<' while escaped) gave
 *    a zero-length run, the cursor was rewound, and the same character was
 *    read again forever.
 *  - The "<!--" / "-->" look-behind checks now inspect the last four / three
 *    characters INCLUDING the current one, as their comments describe.
 */
private function dataState()
{
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    $is_pcdata = $this->content_model === self::PCDATA;
    $is_rcdata = $this->content_model === self::RCDATA;
    $is_cdata = $this->content_model === self::CDATA;

    if ($char === '&' && ($is_pcdata || $is_rcdata)) {
        /* U+0026 AMPERSAND (&)
        In the PCDATA or RCDATA content models: switch to the entity data
        state. Otherwise it is handled by the "anything else" branch. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* In RCDATA/CDATA with the escape flag unset: if the last four
        characters, including this one, are "<!--", set the escape flag. */
        if (($is_rcdata || $is_cdata) && $this->escape === false &&
            $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the character and stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($char === '<' && ($is_pcdata || (($is_rcdata || $is_cdata) && $this->escape === false))) {
        /* U+003C LESS-THAN SIGN (<)
        PCDATA, or RCDATA/CDATA while not escaped: switch to the tag open
        state. Otherwise it is handled by the "anything else" branch. */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* In RCDATA/CDATA with the escape flag set: if the last three
        characters, including this one, are "-->", clear the escape flag. */
        if (($is_rcdata || $is_cdata) && $this->escape === true &&
            $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the character and stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char === $this->EOF) {
        /* EOF: hand over to the end-of-file handler. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* PLAINTEXT (differs from the spec): emit everything that is left as
        one character token, then finish. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else (differs from the spec): emit the whole run up to the
        next '<' or '&' as a single character token. */
        $len = strcspn($this->data, '<&', $this->char);
        if ($len === 0) {
            // The current character is itself a '<' or '&' that is plain
            // text in this content model: consume it so the cursor advances.
            $len = 1;
        }
        $char = substr($this->data, $this->char, $len);
        // Leave the cursor on the last consumed character.
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
/**
 * Entity data state: tries to consume an entity after '&' and emits either
 * the decoded text or a literal ampersand, then returns to the data state.
 *
 * Fix: the result of entity() used to be tested for truthiness, so an entity
 * decoding to the string "0" (e.g. "&#48;") was replaced by '&'. "0" is now
 * accepted as a valid result.
 */
private function entityDataState()
{
    // Attempt to consume an entity (entity() is defined outside this view).
    $entity = $this->entity();

    // Emit the returned text if there is any, otherwise a bare ampersand.
    $has_entity = $entity === '0' || (bool)$entity;
    $char = $has_entity ? $entity : '&';

    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => $char
        )
    );

    // Finally, switch to the data state.
    $this->state = 'data';
}
/**
 * Tag open state: decides what a just-seen '<' introduces, depending on the
 * current content model.
 */
private function tagOpenState()
{
    switch ($this->content_model) {
        case self::RCDATA:
        case self::CDATA:
            // In these models only "</" is significant; the '/' is consumed
            // and the close tag open state takes over.
            if ($this->character($this->char + 1) === '/') {
                $this->char++;
                $this->state = 'closeTagOpen';
                return;
            }

            // Otherwise the '<' is plain text; the following character is
            // left for the data state.
            $this->emitToken(
                array(
                    'type' => self::CHARACTR,
                    'data' => '<'
                )
            );
            $this->state = 'data';
            return;

        case self::PCDATA:
            // Consume the character following the '<'.
            $this->char++;
            $symbol = $this->char();

            if ($symbol === '!') {
                // "<!" opens a markup declaration.
                $this->state = 'markupDeclarationOpen';
                return;
            }

            if ($symbol === '/') {
                // "</" opens an end tag.
                $this->state = 'closeTagOpen';
                return;
            }

            if (preg_match('/^[A-Za-z]$/', $symbol)) {
                // A letter begins a start tag. The token is only prepared
                // here (name lower-cased); it is emitted by a later state.
                $this->token = array(
                    'name' => strtolower($symbol),
                    'type' => self::STARTTAG,
                    'attr' => array()
                );
                $this->state = 'tagName';
                return;
            }

            if ($symbol === '>') {
                // "<>" is a parse error: emit both characters as text.
                $this->emitToken(
                    array(
                        'type' => self::CHARACTR,
                        'data' => '<>'
                    )
                );
                $this->state = 'data';
                return;
            }

            if ($symbol === '?') {
                // "<?" is a parse error handled as a bogus comment.
                $this->state = 'bogusComment';
                return;
            }

            // Anything else is a parse error: emit '<' as text and step back
            // so the data state reconsumes the current character.
            $this->emitToken(
                array(
                    'type' => self::CHARACTR,
                    'data' => '<'
                )
            );
            $this->char--;
            $this->state = 'data';
            return;
    }
}
/**
 * Close tag open state: entered after "</" has been seen.
 *
 * In the RCDATA/CDATA content models the "</" counts as markup only when it
 * is followed by the name of the node on top of the tree constructor's stack
 * and then by whitespace, '>' or '/'; otherwise it is emitted as text.
 */
private function closeTagOpenState()
{
    // Lower-cased run of letters that follows the "</".
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));

    $the_same = false;
    if (count($this->tree->stack) > 0) {
        $the_same = $candidate === end($this->tree->stack)->nodeName;
    }

    $raw_text_model = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    // Short-circuiting keeps the look-ahead from running unless the name matched.
    if ($raw_text_model && (
            !$the_same
            || !preg_match(
                '/[\t\n\x0b\x0c >\/]/',
                $this->character($this->char + 1 + strlen($candidate))
            )
            || $this->EOF === $this->char
        )
    ) {
        // Parse error: "</" is ordinary text; the data state continues with
        // the next character.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->state = 'data';
        return;
    }

    // PCDATA, or a matching end tag name: consume the next input character.
    $this->char++;
    $symbol = $this->char();

    if (preg_match('/^[A-Za-z]$/', $symbol)) {
        // A letter begins an end tag token (name lower-cased). It is not
        // emitted yet; later states complete it.
        $this->token = array(
            'name' => strtolower($symbol),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if ($symbol === '>') {
        // "</>" is a parse error and is dropped.
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error at end of input: emit "</" as text and step back so
        // the data state reconsumes the EOF.
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Any other character is a parse error handled as a bogus comment.
    $this->state = 'bogusComment';
}
/**
 * Tag name state: accumulates the name of the current tag token.
 */
private function tagNameState()
{
    // Consume the next input character.
    $this->char++;
    $symbol = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $symbol)) {
        // Whitespace (tab, LF, VT, FF, space) ends the name; attributes may follow.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($symbol === '>') {
        // End of tag: emit it and go back to the data state.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit what we have and step back so the data state
        // reconsumes the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    if ($symbol === '/') {
        // Parse error unless this is a permitted slash; continue with attributes.
        $this->state = 'beforeAttributeName';
        return;
    }

    // Anything else extends the (lower-cased) tag name.
    $this->token['name'] .= strtolower($symbol);
    $this->state = 'tagName';
}
/**
 * Before attribute name state: skips separators inside a tag until an
 * attribute begins or the tag ends.
 */
private function beforeAttributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $symbol = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $symbol)) {
        // Whitespace (tab, LF, VT, FF, space): keep skipping.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($symbol === '>') {
        // End of tag: emit it and go back to the data state.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($symbol === '/') {
        // Parse error unless this is a permitted slash; stay in this state.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit the tag and step back so the data state
        // reconsumes the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose (lower-cased) name begins
    // with this character and whose value is still unset.
    $this->token['attr'][] = array(
        'name' => strtolower($symbol),
        'value' => null
    );
    $this->state = 'attributeName';
}
/**
 * Attribute name state: accumulates the name of the attribute most recently
 * added to the current tag token.
 */
private function attributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $symbol = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $symbol)) {
        // Whitespace (tab, LF, VT, FF, space) ends the name.
        $this->state = 'afterAttributeName';
        return;
    }

    if ($symbol === '=') {
        // A value follows.
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($symbol === '>') {
        // End of tag: emit it and go back to the data state.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($symbol === '/' && $this->character($this->char + 1) !== '>') {
        // A slash that is not directly followed by '>': parse error unless
        // permitted; look for the next attribute.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit the tag and step back so the data state
        // reconsumes the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else (including a '/' directly before '>') is appended,
    // lower-cased, to the current attribute's name.
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['name'] .= strtolower($symbol);
    $this->state = 'attributeName';
}
/**
 * After attribute name state: an attribute name has ended with whitespace;
 * decide whether a value, another attribute, or the end of the tag follows.
 */
private function afterAttributeNameState()
{
    // Consume the next input character.
    $this->char++;
    $symbol = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $symbol)) {
        // More whitespace (tab, LF, VT, FF, space): keep skipping.
        $this->state = 'afterAttributeName';
        return;
    }

    if ($symbol === '=') {
        // A value follows.
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($symbol === '>') {
        // End of tag: emit it and go back to the data state.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($symbol === '/' && $this->character($this->char + 1) !== '>') {
        // A slash that is not directly followed by '>': parse error unless
        // permitted; look for the next attribute.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit the tag and step back so the data state
        // reconsumes the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose (lower-cased) name begins
    // with this character and whose value is still unset.
    $this->token['attr'][] = array(
        'name' => strtolower($symbol),
        'value' => null
    );
    $this->state = 'attributeName';
}
/**
 * Before attribute value state: entered after '='; determines how the
 * attribute value is delimited.
 *
 * Fix: end of input is now handled explicitly, in the same way as in the
 * sibling attribute states. Before, a tag cut off right after '=' fell into
 * the "anything else" branch with a null character and moved on to the
 * unquoted value state past the end of the input.
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Whitespace (tab, LF, VT, FF, space)
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Attribute value (double-quoted) state: collects the value up to the
 * closing quotation mark.
 */
private function attributeValueDoubleQuotedState()
{
    // Consume the next input character.
    $this->char++;
    $symbol = $this->character($this->char);

    if ($symbol === '"') {
        // Closing quote: the value is complete.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($symbol === '&') {
        // Possible entity; the helper appends its result to the value. The
        // state is left unchanged, so this handler runs again afterwards.
        $this->entityInAttributeValueState('double');
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit the tag and step back so the data state
        // reconsumes the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is part of the value.
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['value'] .= $symbol;
    $this->state = 'attributeValueDoubleQuoted';
}
/**
 * Attribute value (single-quoted) state: collects the value up to the
 * closing apostrophe.
 */
private function attributeValueSingleQuotedState()
{
    // Consume the next input character.
    $this->char++;
    $symbol = $this->character($this->char);

    if ($symbol === '\'') {
        // Closing apostrophe: the value is complete.
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($symbol === '&') {
        // Possible entity; the helper appends its result to the value. The
        // state is left unchanged, so this handler runs again afterwards.
        $this->entityInAttributeValueState('single');
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit the tag and step back so the data state
        // reconsumes the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is part of the value.
    $current = count($this->token['attr']) - 1;
    $this->token['attr'][$current]['value'] .= $symbol;
    $this->state = 'attributeValueSingleQuoted';
}
/**
 * Attribute value (unquoted) state: collects the value up to whitespace or
 * the end of the tag.
 *
 * Fix: end of input is now handled. Before, an unterminated unquoted value
 * at the end of the input (e.g. "<a href=foo") kept appending a null
 * character and never left this state, so the tokenizer looped forever.
 */
private function attributeValueUnquotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Whitespace (tab, LF, VT, FF, space)
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the entity in attribute value state. */
        $this->entityInAttributeValueState();

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. ">=" is used because this state can be
        entered with the cursor already at the end of the input. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Entity in attribute value: tries to consume an entity after '&' and
 * appends either the decoded text or a literal ampersand to the value of the
 * current attribute. The tokenizer state is not changed here.
 *
 * Some callers pass a quoting hint ('double' / 'single'); it is not declared
 * as a parameter and is ignored.
 *
 * Fix: the result of entity() used to be tested for truthiness, so an entity
 * decoding to the string "0" (e.g. "&#48;") was replaced by '&'. "0" is now
 * accepted as a valid result.
 */
private function entityInAttributeValueState()
{
    // Attempt to consume an entity (entity() is defined outside this view).
    $entity = $this->entity();

    // Use the returned text if there is any, otherwise a bare ampersand.
    $has_entity = $entity === '0' || (bool)$entity;
    $char = $has_entity ? $entity : '&';

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
/**
 * Bogus comment state: turns malformed markup into a comment token.
 *
 * The comment data is what characters() returns for the class '^>' starting
 * at the character that caused the switch into this state; it is intended to
 * be everything up to, but not including, the next '>' or the end of the
 * input.
 */
private function bogusCommentState()
{
    $comment = $this->characters('^>', $this->char);

    $token = array(
        'data' => $comment,
        'type' => self::COMMENT
    );
    $this->emitToken($token);

    // Move the cursor past the consumed text and resume in the data state.
    $this->char += strlen($comment);
    $this->state = 'data';

    // If that reached the end of the input, step back one position so the
    // data state reconsumes the EOF.
    if ($this->char === $this->EOF) {
        $this->char--;
    }
}
/**
 * Markup declaration open state: entered after "<!"; distinguishes comments,
 * DOCTYPE declarations and bogus comments.
 */
private function markupDeclarationOpenState()
{
    // "--" starts a comment: consume both hyphens and prepare a comment
    // token with no data yet.
    if ($this->character($this->char + 1, 2) === '--') {
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        $this->char += 2;
        $this->state = 'comment';
        return;
    }

    // A case-insensitive "DOCTYPE" starts a doctype: consume the seven
    // characters.
    if (strtolower($this->character($this->char + 1, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    // Anything else is a parse error. Advance to the next character, which
    // becomes the first character of the bogus comment.
    $this->char++;
    $this->state = 'bogusComment';
}
/**
 * Comment state: accumulates comment data until a hyphen or the end of the
 * input is seen.
 */
private function commentState()
{
    // Consume the next input character.
    $this->char++;
    $symbol = $this->char();

    if ($symbol === '-') {
        // Possibly the start of the closing "-->".
        $this->state = 'commentDash';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit the unfinished comment and step back so the
        // data state reconsumes the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is comment data; the state stays unchanged.
    $this->token['data'] .= $symbol;
}
/**
 * Comment dash state: one hyphen has been seen inside a comment.
 */
private function commentDashState()
{
    // Consume the next input character.
    $this->char++;
    $symbol = $this->char();

    if ($symbol === '-') {
        // Second hyphen: the comment may be ending.
        $this->state = 'commentEnd';
        return;
    }

    if ($this->char === $this->EOF) {
        // Parse error: emit the unfinished comment and step back so the
        // data state reconsumes the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The hyphen was ordinary data: append it together with the current
    // character and continue in the comment state.
    $this->token['data'] .= '-' . $symbol;
    $this->state = 'comment';
}
/**
 * Tokenizer "comment end" state: two consecutive hyphens have been seen.
 *
 * @return void
 */
private function commentEndState()
{
    // Consume the next input character.
    $this->char++;
    $next = $this->char();

    // ">" completes the "-->" terminator: emit the comment.
    if ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // Yet another hyphen: keep one as comment text and stay in this state.
    if ($next === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // EOF: emit what has been collected and step back so the EOF is
    // reconsumed in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: the two hyphens were ordinary text after all.
    $this->token['data'] .= '--' . $next;
    $this->state = 'comment';
}
/**
 * Tokenizer "DOCTYPE" state.
 *
 * Always continues in the before-DOCTYPE-name state. A single whitespace
 * character after the keyword is consumed; any other character is put back
 * so that the next state sees it.
 *
 * @return void
 */
private function doctypeState()
{
    // Consume the next input character.
    $this->char++;

    // Not whitespace: un-consume it.
    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $this->char())) {
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
/**
 * Tokenizer "before DOCTYPE name" state.
 *
 * Skips whitespace, then either starts a DOCTYPE token from the first name
 * character or emits a nameless DOCTYPE token when the declaration ends
 * immediately. Every token created here is marked as being in error.
 *
 * @return void
 */
private function beforeDoctypeNameState()
{
    // Consume the next input character.
    $this->char++;
    $next = $this->char();

    // Whitespace: stay in the before DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    // Lowercase ASCII letter: start the name with its uppercase form.
    if (preg_match('/^[a-z]$/', $next)) {
        $this->token = array(
            'name' => strtoupper($next),
            'type' => self::DOCTYPE,
            'error' => true
        );
        $this->state = 'doctypeName';
        return;
    }

    // ">" or EOF: there is no name at all. Emit a nameless DOCTYPE token.
    $closed = ($next === '>');
    if ($closed || $this->char === $this->EOF) {
        $this->emitToken(
            array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            )
        );

        // Only the EOF case steps back, so that the EOF is reconsumed.
        if (!$closed) {
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // Anything else: start the name with the character unchanged.
    $this->token = array(
        'name' => $next,
        'type' => self::DOCTYPE,
        'error' => true
    );
    $this->state = 'doctypeName';
}
/**
 * Tokenizer "DOCTYPE name" state: accumulates the DOCTYPE name.
 *
 * Lowercase ASCII letters are folded to uppercase as they are appended.
 * After every character the token's error flag is recomputed: the token is
 * considered correct only while its name is exactly "HTML".
 *
 * @return void
 */
private function doctypeNameState()
{
    // Consume the next input character.
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Whitespace ends the name. The state name is spelled in lower
        // camel case like every other state in this class; the previous
        // 'AfterDoctypeName' spelling only reached afterDoctypeNameState()
        // through PHP's case-insensitive method lookup.
        // NOTE(review): assumes states are dispatched by method name --
        // confirm against the tokenizer's main loop.
        $this->state = 'afterDoctypeName';

    } elseif ($char === '>') {
        // End of the declaration: emit the token.
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        // Lowercase ASCII letter: append its uppercase form.
        $this->token['name'] .= strtoupper($char);

    } elseif ($this->char === $this->EOF) {
        // EOF: emit the token and step back so the EOF is reconsumed in the
        // data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        // Anything else: append the character unchanged.
        $this->token['name'] .= $char;
    }

    // Runs on every path, including after the token has been emitted.
    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
/**
 * Tokenizer "after DOCTYPE name" state.
 *
 * Only whitespace, ">" or end of input are accepted after the name; any
 * other character marks the token as being in error and hands over to the
 * bogus DOCTYPE state.
 *
 * @return void
 */
private function afterDoctypeNameState()
{
    // Consume the next input character.
    $this->char++;
    $next = $this->char();

    // Whitespace: stay in the after DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    // ">" or EOF: the declaration is over, emit the token.
    $closed = ($next === '>');
    if ($closed || $this->char === $this->EOF) {
        $this->emitToken($this->token);

        // Only the EOF case steps back, so that the EOF is reconsumed in
        // the data state.
        if (!$closed) {
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // Anything else: the DOCTYPE is malformed.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
/**
 * Tokenizer "bogus DOCTYPE" state: discards characters until the
 * declaration is closed by ">" or the input ends, then emits the token.
 *
 * @return void
 */
private function bogusDoctypeState()
{
    // Consume the next input character.
    $this->char++;
    $closed = ($this->char() === '>');

    // Neither ">" nor EOF: stay in the bogus DOCTYPE state.
    if (!$closed && $this->char !== $this->EOF) {
        return;
    }

    $this->emitToken($this->token);

    // Only the EOF case steps back, so that the EOF is reconsumed in the
    // data state.
    if (!$closed) {
        $this->char--;
    }

    $this->state = 'data';
}
/**
 * Attempts to consume a character reference ("entity").
 *
 * Expects the read pointer to sit on the U+0026 AMPERSAND that introduces
 * the reference. This definition is used when parsing entities in text and
 * in attributes.
 *
 * On success the pointer is left on the last character that belongs to the
 * reference (including a terminating semicolon when one is present) and the
 * decoded text is returned. On failure the pointer is restored and false is
 * returned, so nothing is consumed.
 *
 * Fixes over the previous numeric branch:
 *  - the hexadecimal marker is looked up after the "#" (it used to re-test
 *    the "#" itself, so the "x"/"X" case could never match);
 *  - the reference text is built from the digits that were actually read
 *    (it used to be cut out of the input with an offset passed where a
 *    length is expected);
 *  - the pointer is advanced past the digits and an optional semicolon,
 *    mirroring what the named branch does.
 *
 * @return string|false Decoded character(s), or false when no reference
 *                      could be matched.
 */
private function entity()
{
    $start = $this->char;
    $entity = null;

    // The behaviour depends on the identity of the next character (the one
    // immediately after the U+0026 AMPERSAND character).
    if ($this->character($start + 1) === '#') {
        // Numeric reference. The character after the U+0023 NUMBER SIGN
        // selects hexadecimal (x / X) or decimal notation.
        $marker = $this->character($start + 2);
        $is_hex = ($marker === 'x' || $marker === 'X');
        $char_class = $is_hex ? '0-9A-Fa-f' : '0-9';

        // Offset of the first digit: past "&#", plus the marker if any.
        $digits_at = $start + ($is_hex ? 3 : 2);

        // Consume as many characters as match the range given above.
        $e_name = $this->characters($char_class, $digits_at);

        // NOTE(review): what characters() returns when nothing matches is
        // not visible from here, so the result is validated explicitly.
        if (is_string($e_name) && preg_match('/^[' . $char_class . ']+$/D', $e_name)) {
            // Leave the pointer on the last digit ...
            $this->char = $digits_at + strlen($e_name) - 1;

            // ... or on the terminating semicolon when there is one.
            if ($this->character($this->char + 1) === ';') {
                $this->char++;
            }

            $entity = '#' . ($is_hex ? 'x' : '') . $e_name;
        }
    } else {
        // Named reference. Walk through ever longer prefixes of the
        // candidate run and stop at the first one that is an identifier in
        // the entities table.
        $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
        $len = strlen($e_name);

        for ($c = 1; $c <= $len; $c++) {
            $id = substr($e_name, 0, $c);
            $this->char++;

            if (in_array($id, $this->entities)) {
                if ($e_name[$c - 1] !== ';' && $c < $len && $e_name[$c] === ';') {
                    $this->char++; // consume extra semicolon
                }

                $entity = $id;
                break;
            }
        }
    }

    if ($entity === null) {
        // If no match can be made, then this is a parse error. No
        // characters are consumed, and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Return the character(s) corresponding to the reference.
    return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
}
/**
 * Hands a token to the tree constructor and updates the tokenizer's content
 * model from the outcome.
 *
 * An integer returned by the tree constructor becomes the new content
 * model. Otherwise an end tag token resets the content model to PCDATA.
 *
 * @param array $token Token to emit; its 'type' entry is read here.
 * @return void
 */
private function emitToken($token)
{
    $requested_model = $this->tree->emitToken($token);

    if (is_int($requested_model)) {
        $this->content_model = $requested_model;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
/**
 * Signals end of input: clears the tokenizer state and passes an
 * end-of-file token straight to the tree constructor.
 *
 * @return void
 */
private function EOF()
{
    $eof_token = array(
        'type' => self::EOF
    );

    // NOTE(review): a null state presumably stops the tokenizer loop --
    // confirm against the code that drives the states.
    $this->state = null;
    $this->tree->emitToken($eof_token);
}
1576 class HTML5TreeConstructer
1578 public $stack = array();
1580 private $phase;
1581 private $mode;
1582 private $dom;
1583 private $foster_parent = null;
1584 private $a_formatting = array();
1586 private $head_pointer = null;
1587 private $form_pointer = null;
1589 private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');
1590 private $formatting = array(
1591 'a',
1592 'b',
1593 'big',
1594 'em',
1595 'font',
1596 'i',
1597 'nobr',
1598 's',
1599 'small',
1600 'strike',
1601 'strong',
1602 'tt',
1605 private $special = array(
1606 'address',
1607 'area',
1608 'base',
1609 'basefont',
1610 'bgsound',
1611 'blockquote',
1612 'body',
1613 'br',
1614 'center',
1615 'col',
1616 'colgroup',
1617 'dd',
1618 'dir',
1619 'div',
1620 'dl',
1621 'dt',
1622 'embed',
1623 'fieldset',
1624 'form',
1625 'frame',
1626 'frameset',
1627 'h1',
1628 'h2',
1629 'h3',
1630 'h4',
1631 'h5',
1632 'h6',
1633 'head',
1634 'hr',
1635 'iframe',
1636 'image',
1637 'img',
1638 'input',
1639 'isindex',
1640 'li',
1641 'link',
1642 'listing',
1643 'menu',
1644 'meta',
1645 'noembed',
1646 'noframes',
1647 'noscript',
1648 'ol',
1649 'optgroup',
1650 'option',
1651 'p',
1652 'param',
1653 'plaintext',
1654 'pre',
1655 'script',
1656 'select',
1657 'spacer',
1658 'style',
1659 'tbody',
1660 'textarea',
1661 'tfoot',
1662 'thead',
1663 'title',
1664 'tr',
1665 'ul',
1666 'wbr'
1669 // The different phases.
1670 const INIT_PHASE = 0;
1671 const ROOT_PHASE = 1;
1672 const MAIN_PHASE = 2;
1673 const END_PHASE = 3;
1675 // The different insertion modes for the main phase.
1676 const BEFOR_HEAD = 0;
1677 const IN_HEAD = 1;
1678 const AFTER_HEAD = 2;
1679 const IN_BODY = 3;
1680 const IN_TABLE = 4;
1681 const IN_CAPTION = 5;
1682 const IN_CGROUP = 6;
1683 const IN_TBODY = 7;
1684 const IN_ROW = 8;
1685 const IN_CELL = 9;
1686 const IN_SELECT = 10;
1687 const AFTER_BODY = 11;
1688 const IN_FRAME = 12;
1689 const AFTR_FRAME = 13;
1691 // The different types of elements.
1692 const SPECIAL = 0;
1693 const SCOPING = 1;
1694 const FORMATTING = 2;
1695 const PHRASING = 3;
1697 const MARKER = 0;
/**
 * Puts the tree constructor in its initial phase and insertion mode and
 * creates the DOM document that will receive the parsed nodes.
 */
public function __construct()
{
    // Configure the document before publishing it on the instance.
    $document = new DOMDocument();
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->mode = self::BEFOR_HEAD;
    $this->phase = self::INIT_PHASE;
}
1711 // Process tag tokens
/**
 * Entry point of the tree constructor: routes a token to the handler of the
 * current construction phase.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Whatever the phase handler returns; null when the current
 *               phase matches none of the known phases.
 */
public function emitToken($token)
{
    switch ($this->phase) {
        case self::INIT_PHASE:
            $result = $this->initPhase($token);
            break;

        case self::ROOT_PHASE:
            $result = $this->rootElementPhase($token);
            break;

        case self::MAIN_PHASE:
            $result = $this->mainPhase($token);
            break;

        case self::END_PHASE:
            $result = $this->trailingEndPhase($token);
            break;

        default:
            $result = null;
            break;
    }

    return $result;
}
/**
 * Initial phase of tree construction.
 *
 * Three outcomes are possible:
 *  - the token starts the document proper (see below): switch to the root
 *    element phase and reprocess the token there;
 *  - the token is a DOCTYPE marked as correct: switch to the root element
 *    phase without reprocessing;
 *  - the token consists only of whitespace: append it to the document as a
 *    text node and stay in this phase.
 *
 * THIS DIFFERS FROM THE SPEC: no DocumentType node is appended for a
 * correct DOCTYPE. The previous code instantiated a DOMDocumentType but
 * never attached it to the document, so that dead object creation has been
 * dropped; the resulting DOM is unchanged.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the root element phase when the token is
 *               reprocessed there, null otherwise.
 */
private function initPhase($token)
{
    $type = $token['type'];
    $has_error_flag = isset($token['error']);

    /* A DOCTYPE token that is marked as being in error
    A comment token
    A start tag token
    An end tag token
    A character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE
    An end-of-file token */
    $starts_document = ($has_error_flag && $token['error'])
        || in_array(
            $type,
            array(HTML5::COMMENT, HTML5::STARTTAG, HTML5::ENDTAG, HTML5::EOF),
            true
        )
        || ($type === HTML5::CHARACTR && isset($token['data'])
            && !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']));

    if ($starts_document) {
        /* This specification does not define how to handle this case. Move
        on to the root element phase and reprocess the token. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct: switch to the root element
    phase of the tree construction stage. */
    if ($has_error_flag && !$token['error']) {
        $this->phase = self::ROOT_PHASE;
        return null;
    }

    /* A character token made up only of whitespace: append it to the
    Document node. */
    if (isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->dom->appendChild($this->dom->createTextNode($token['data']));
    }

    return null;
}
/**
 * Root element phase of tree construction.
 *
 * Comments and whitespace are appended directly to the document. The first
 * token that needs a parent creates the <html> element, pushes it on the
 * stack of open elements and is then reprocessed in the main phase.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the main phase when the token is reprocessed
 *               there, null otherwise.
 */
private function rootElementPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token: parse error, ignore the token. */
    if ($type === HTML5::DOCTYPE) {
        return null;
    }

    /* A comment token: append a Comment node to the Document object. */
    if ($type === HTML5::COMMENT) {
        $this->dom->appendChild($this->dom->createComment($token['data']));
        return null;
    }

    /* A character token made up only of whitespace: append it to the
    Document node. */
    if ($type === HTML5::CHARACTR && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->dom->appendChild($this->dom->createTextNode($token['data']));
        return null;
    }

    /* Any other character token (whitespace-only ones were handled above),
    a start tag token, an end tag token or an end-of-file token: create the
    html element, append it to the Document object, switch to the main
    phase and reprocess the current token. */
    $needs_root = $type === HTML5::CHARACTR
        || $type === HTML5::STARTTAG
        || $type === HTML5::ENDTAG
        || $type === HTML5::EOF;

    if (!$needs_root) {
        return null;
    }

    $root = $this->dom->createElement('html');
    $this->dom->appendChild($root);
    $this->stack[] = $root;

    $this->phase = self::MAIN_PHASE;
    return $this->mainPhase($token);
}
/**
 * Main phase of tree construction.
 *
 * DOCTYPE tokens, repeated <html> start tags and end-of-file are handled
 * here; every other token is routed to the handler of the current
 * insertion mode.
 *
 * The switch used to end with a "case self::END_PHASE" entry. That constant
 * is a phase, not an insertion mode, and its value is the same as
 * self::IN_BODY, so the entry could never be selected. It has been removed.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Whatever the insertion mode handler returns; null for the
 *               tokens handled directly in this method.
 */
private function mainPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token: parse error, ignore the token. */
    if ($type === HTML5::DOCTYPE) {
        return null;
    }

    /* A start tag token with the tag name "html". If this start tag token
    was not the first start tag token, then it is a parse error. */
    if ($type === HTML5::STARTTAG && $token['name'] === 'html') {
        /* For each attribute on the token, check to see if the attribute is
        already present on the top element of the stack of open elements.
        If it is not, add the attribute and its corresponding value to that
        element. */
        $root = $this->stack[0];

        foreach ($token['attr'] as $attr) {
            if (!$root->hasAttribute($attr['name'])) {
                $root->setAttribute($attr['name'], $attr['value']);
            }
        }

        return null;
    }

    /* An end-of-file token: generate implied end tags. */
    if ($type === HTML5::EOF) {
        $this->generateImpliedEndTags();
        return null;
    }

    /* Anything else: depends on the insertion mode. */
    switch ($this->mode) {
        case self::BEFOR_HEAD:
            return $this->beforeHead($token);

        case self::IN_HEAD:
            return $this->inHead($token);

        case self::AFTER_HEAD:
            return $this->afterHead($token);

        case self::IN_BODY:
            return $this->inBody($token);

        case self::IN_TABLE:
            return $this->inTable($token);

        case self::IN_CAPTION:
            return $this->inCaption($token);

        case self::IN_CGROUP:
            return $this->inColumnGroup($token);

        case self::IN_TBODY:
            return $this->inTableBody($token);

        case self::IN_ROW:
            return $this->inRow($token);

        case self::IN_CELL:
            return $this->inCell($token);

        case self::IN_SELECT:
            return $this->inSelect($token);

        case self::AFTER_BODY:
            return $this->afterBody($token);

        case self::IN_FRAME:
            return $this->inFrameset($token);

        case self::AFTR_FRAME:
            return $this->afterFrameset($token);
    }

    return null;
}
/**
 * "Before head" insertion mode.
 *
 * Whitespace and comments are inserted as they come. An explicit <head>
 * start tag opens the head; most other tokens open it implicitly and are
 * then reprocessed in the "in head" mode.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of inHead() when the token is reprocessed there,
 *               null otherwise.
 */
private function beforeHead($token)
{
    $type = $token['type'];

    /* A character token made up only of whitespace: append the character
    to the current node. */
    if ($type === HTML5::CHARACTR && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return null;
    }

    /* A comment token: append a Comment node to the current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return null;
    }

    /* A start tag token with the tag name "head": insert the element,
    remember it as the head element pointer and change the insertion mode
    to "in head". */
    if ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;
        return null;
    }

    /* Any other start tag token, an end tag with the tag name "html", or a
    character token that is not a single whitespace character. */
    $implies_head = $type === HTML5::STARTTAG
        || ($type === HTML5::ENDTAG && $token['name'] === 'html')
        || ($type === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/', $token['data']));

    if ($implies_head) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $implied_head = array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        );
        $this->beforeHead($implied_head);

        return $this->inHead($token);
    }

    /* Any other end tag: parse error, ignore the token. Anything that did
    not match above is ignored as well. */
    return null;
}
/**
 * "In head" insertion mode.
 *
 * Handles the elements that live in <head> (title, style, script, base,
 * link, meta), closes the head on </head> and, for anything that does not
 * belong in the head, leaves the head and reprocesses the token in the
 * "after head" mode.
 *
 * The head element pointer may be null (innerHTML case). The title, style,
 * base, link and meta branches already allowed for that; the script branch
 * and the two "is the head the current node" checks now do too, instead of
 * calling a method on null.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed A content model constant (HTML5::RCDATA, HTML5::CDATA or
 *               HTML5::PCDATA) when the tokenizer has to switch, the result
 *               of afterHead() when the token is reprocessed there, null
 *               otherwise.
 */
private function inHead($token)
{
    $type = $token['type'];
    $text_holders = array('title', 'style', 'script');

    /* A character token made up only of whitespace.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if ($type === HTML5::CHARACTR && (
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
        || in_array(end($this->stack)->nodeName, $text_holders, true)
    )) {
        $this->insertText($token['data']);
        return null;
    }

    /* A comment token: append a Comment node to the current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return null;
    }

    /* An end tag for title, style or script: pop the current node and put
    the tokenizer back into the PCDATA content model. */
    if ($type === HTML5::ENDTAG && in_array($token['name'], $text_holders, true)) {
        array_pop($this->stack);
        return HTML5::PCDATA;
    }

    if ($type === HTML5::STARTTAG) {
        $name = $token['name'];

        /* A start tag with the tag name "title", "style" or "script":
        create an element for the token and append it to the node pointed
        to by the head element pointer, or, if that is null (innerHTML
        case), to the current node. */
        if ($name === 'title' || $name === 'style' || $name === 'script') {
            if ($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);
            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag: RCDATA for title,
            CDATA for style and script. */
            return ($name === 'title') ? HTML5::RCDATA : HTML5::CDATA;
        }

        /* A start tag with the tag name "base", "link", or "meta": same
        placement rule as above. */
        if ($name === 'base' || $name === 'link' || $name === 'meta') {
            if ($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);
                array_pop($this->stack);
            } else {
                $this->insertElement($token);
            }

            return null;
        }

        /* A start tag with the tag name "head": parse error, ignore the
        token. */
        if ($name === 'head') {
            return null;
        }
    }

    if ($type === HTML5::ENDTAG) {
        /* An end tag with the tag name "head". */
        if ($token['name'] === 'head') {
            /* If the current node is a head element, pop the current node
            off the stack of open elements. Otherwise, this is a parse
            error. */
            if ($this->head_pointer !== null
                && $this->head_pointer->isSameNode(end($this->stack))
            ) {
                array_pop($this->stack);
            }

            /* Change the insertion mode to "after head". */
            $this->mode = self::AFTER_HEAD;
            return null;
        }

        /* Any other end tag except "html": parse error, ignore the token. */
        if ($token['name'] !== 'html') {
            return null;
        }
    }

    /* Anything else. If the current node is a head element, act as if an
    end tag token with the tag name "head" had been seen. Otherwise, change
    the insertion mode to "after head". */
    if ($this->head_pointer !== null
        && $this->head_pointer->isSameNode(end($this->stack))
    ) {
        $this->inHead(
            array(
                'name' => 'head',
                'type' => HTML5::ENDTAG
            )
        );
    } else {
        $this->mode = self::AFTER_HEAD;
    }

    /* Then, reprocess the current token. */
    return $this->afterHead($token);
}
/**
 * "After head" insertion mode.
 *
 * Waits for the element that opens the document body: an explicit <body>
 * or <frameset>, or an implied <body> for any content token. Head-only
 * elements that show up late are sent back to the "in head" mode.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of inHead() or inBody() when the token is
 *               reprocessed there, null otherwise.
 */
private function afterHead($token)
{
    $type = $token['type'];

    /* A character token made up only of whitespace: append the character
    to the current node. */
    if ($type === HTML5::CHARACTR && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return null;
    }

    /* A comment token: append a Comment node to the current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return null;
    }

    if ($type === HTML5::STARTTAG) {
        /* A start tag token with the tag name "body": insert a body
        element and change the insertion mode to "in body". */
        if ($token['name'] === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;
            return null;
        }

        /* A start tag token with the tag name "frameset": insert a
        frameset element and change the insertion mode to "in frameset". */
        if ($token['name'] === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;
            return null;
        }

        /* A start tag token whose tag name is one of: "base", "link",
        "meta", "script", "style", "title". Parse error. Switch the
        insertion mode back to "in head" and reprocess the token. */
        $head_only = array('base', 'link', 'meta', 'script', 'style', 'title');
        if (in_array($token['name'], $head_only)) {
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a start tag token with the tag name "body"
    and no attributes had been seen, and then reprocess the current
    token. */
    $implied_body = array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    );
    $this->afterHead($implied_body);

    return $this->inBody($token);
}
2175 private function inBody($token)
2177 /* Handle the token as follows: */
2179 switch ($token['type']) {
2180 /* A character token */
2181 case HTML5::CHARACTR:
2182 /* Reconstruct the active formatting elements, if any. */
2183 $this->reconstructActiveFormattingElements();
2185 /* Append the token's character to the current node. */
2186 $this->insertText($token['data']);
2187 break;
2189 /* A comment token */
2190 case HTML5::COMMENT:
2191 /* Append a Comment node to the current node with the data
2192 attribute set to the data given in the comment token. */
2193 $this->insertComment($token['data']);
2194 break;
2196 case HTML5::STARTTAG:
2197 switch ($token['name']) {
2198 /* A start tag token whose tag name is one of: "script",
2199 "style" */
2200 case 'script':
2201 case 'style':
2202 /* Process the token as if the insertion mode had been "in
2203 head". */
2204 return $this->inHead($token);
2205 break;
2207 /* A start tag token whose tag name is one of: "base", "link",
2208 "meta", "title" */
2209 case 'base':
2210 case 'link':
2211 case 'meta':
2212 case 'title':
2213 /* Parse error. Process the token as if the insertion mode
2214 had been "in head". */
2215 return $this->inHead($token);
2216 break;
2218 /* A start tag token with the tag name "body" */
2219 case 'body':
2220 /* Parse error. If the second element on the stack of open
2221 elements is not a body element, or, if the stack of open
2222 elements has only one node on it, then ignore the token.
2223 (innerHTML case) */
2224 if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2225 // Ignore
2227 /* Otherwise, for each attribute on the token, check to see
2228 if the attribute is already present on the body element (the
2229 second element) on the stack of open elements. If it is not,
2230 add the attribute and its corresponding value to that
2231 element. */
2232 } else {
2233 foreach ($token['attr'] as $attr) {
2234 if (!$this->stack[1]->hasAttribute($attr['name'])) {
2235 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2239 break;
2241 /* A start tag whose tag name is one of: "address",
2242 "blockquote", "center", "dir", "div", "dl", "fieldset",
2243 "listing", "menu", "ol", "p", "ul" */
2244 case 'address':
2245 case 'blockquote':
2246 case 'center':
2247 case 'dir':
2248 case 'div':
2249 case 'dl':
2250 case 'fieldset':
2251 case 'listing':
2252 case 'menu':
2253 case 'ol':
2254 case 'p':
2255 case 'ul':
2256 /* If the stack of open elements has a p element in scope,
2257 then act as if an end tag with the tag name p had been
2258 seen. */
2259 if ($this->elementInScope('p')) {
2260 $this->emitToken(
2261 array(
2262 'name' => 'p',
2263 'type' => HTML5::ENDTAG
2268 /* Insert an HTML element for the token. */
2269 $this->insertElement($token);
2270 break;
2272 /* A start tag whose tag name is "form" */
2273 case 'form':
2274 /* If the form element pointer is not null, ignore the
2275 token with a parse error. */
2276 if ($this->form_pointer !== null) {
2277 // Ignore.
2279 /* Otherwise: */
2280 } else {
2281 /* If the stack of open elements has a p element in
2282 scope, then act as if an end tag with the tag name p
2283 had been seen. */
2284 if ($this->elementInScope('p')) {
2285 $this->emitToken(
2286 array(
2287 'name' => 'p',
2288 'type' => HTML5::ENDTAG
2293 /* Insert an HTML element for the token, and set the
2294 form element pointer to point to the element created. */
2295 $element = $this->insertElement($token);
2296 $this->form_pointer = $element;
2298 break;
2300 /* A start tag whose tag name is "li", "dd" or "dt" */
2301 case 'li':
2302 case 'dd':
2303 case 'dt':
2304 /* If the stack of open elements has a p element in scope,
2305 then act as if an end tag with the tag name p had been
2306 seen. */
2307 if ($this->elementInScope('p')) {
2308 $this->emitToken(
2309 array(
2310 'name' => 'p',
2311 'type' => HTML5::ENDTAG
2316 $stack_length = count($this->stack) - 1;
2318 for ($n = $stack_length; 0 <= $n; $n--) {
2319 /* 1. Initialise node to be the current node (the
2320 bottommost node of the stack). */
2321 $stop = false;
2322 $node = $this->stack[$n];
2323 $cat = $this->getElementCategory($node->tagName);
2325 /* 2. If node is an li, dd or dt element, then pop all
2326 the nodes from the current node up to node, including
2327 node, then stop this algorithm. */
2328 if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2329 && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2331 for ($x = $stack_length; $x >= $n; $x--) {
2332 array_pop($this->stack);
2335 break;
2338 /* 3. If node is not in the formatting category, and is
2339 not in the phrasing category, and is not an address or
2340 div element, then stop this algorithm. */
2341 if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2342 $node->tagName !== 'address' && $node->tagName !== 'div'
2344 break;
2348 /* Finally, insert an HTML element with the same tag
2349 name as the token's. */
2350 $this->insertElement($token);
2351 break;
2353 /* A start tag token whose tag name is "plaintext" */
2354 case 'plaintext':
2355 /* If the stack of open elements has a p element in scope,
2356 then act as if an end tag with the tag name p had been
2357 seen. */
2358 if ($this->elementInScope('p')) {
2359 $this->emitToken(
2360 array(
2361 'name' => 'p',
2362 'type' => HTML5::ENDTAG
2367 /* Insert an HTML element for the token. */
2368 $this->insertElement($token);
2370 return HTML5::PLAINTEXT;
2371 break;
2373 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2374 "h5", "h6" */
2375 case 'h1':
2376 case 'h2':
2377 case 'h3':
2378 case 'h4':
2379 case 'h5':
2380 case 'h6':
2381 /* If the stack of open elements has a p element in scope,
2382 then act as if an end tag with the tag name p had been seen. */
2383 if ($this->elementInScope('p')) {
2384 $this->emitToken(
2385 array(
2386 'name' => 'p',
2387 'type' => HTML5::ENDTAG
2392 /* If the stack of open elements has in scope an element whose
2393 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2394 this is a parse error; pop elements from the stack until an
2395 element with one of those tag names has been popped from the
2396 stack. */
2397 while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2398 array_pop($this->stack);
2401 /* Insert an HTML element for the token. */
2402 $this->insertElement($token);
2403 break;
2405 /* A start tag whose tag name is "a" */
2406 case 'a':
2407 /* If the list of active formatting elements contains
2408 an element whose tag name is "a" between the end of the
2409 list and the last marker on the list (or the start of
2410 the list if there is no marker on the list), then this
2411 is a parse error; act as if an end tag with the tag name
2412 "a" had been seen, then remove that element from the list
2413 of active formatting elements and the stack of open
2414 elements if the end tag didn't already remove it (it
2415 might not have if the element is not in table scope). */
2416 $leng = count($this->a_formatting);
2418 for ($n = $leng - 1; $n >= 0; $n--) {
2419 if ($this->a_formatting[$n] === self::MARKER) {
2420 break;
2422 } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2423 $this->emitToken(
2424 array(
2425 'name' => 'a',
2426 'type' => HTML5::ENDTAG
2429 break;
2433 /* Reconstruct the active formatting elements, if any. */
2434 $this->reconstructActiveFormattingElements();
2436 /* Insert an HTML element for the token. */
2437 $el = $this->insertElement($token);
2439 /* Add that element to the list of active formatting
2440 elements. */
2441 $this->a_formatting[] = $el;
2442 break;
2444 /* A start tag whose tag name is one of: "b", "big", "em", "font",
2445 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2446 case 'b':
2447 case 'big':
2448 case 'em':
2449 case 'font':
2450 case 'i':
2451 case 'nobr':
2452 case 's':
2453 case 'small':
2454 case 'strike':
2455 case 'strong':
2456 case 'tt':
2457 case 'u':
2458 /* Reconstruct the active formatting elements, if any. */
2459 $this->reconstructActiveFormattingElements();
2461 /* Insert an HTML element for the token. */
2462 $el = $this->insertElement($token);
2464 /* Add that element to the list of active formatting
2465 elements. */
2466 $this->a_formatting[] = $el;
2467 break;
2469 /* A start tag token whose tag name is "button" */
2470 case 'button':
2471 /* If the stack of open elements has a button element in scope,
2472 then this is a parse error; act as if an end tag with the tag
2473 name "button" had been seen, then reprocess the token. (We don't
2474 do that. Unnecessary.) */
2475 if ($this->elementInScope('button')) {
2476 $this->inBody(
2477 array(
2478 'name' => 'button',
2479 'type' => HTML5::ENDTAG
2484 /* Reconstruct the active formatting elements, if any. */
2485 $this->reconstructActiveFormattingElements();
2487 /* Insert an HTML element for the token. */
2488 $this->insertElement($token);
2490 /* Insert a marker at the end of the list of active
2491 formatting elements. */
2492 $this->a_formatting[] = self::MARKER;
2493 break;
2495 /* A start tag token whose tag name is one of: "marquee", "object" */
2496 case 'marquee':
2497 case 'object':
2498 /* Reconstruct the active formatting elements, if any. */
2499 $this->reconstructActiveFormattingElements();
2501 /* Insert an HTML element for the token. */
2502 $this->insertElement($token);
2504 /* Insert a marker at the end of the list of active
2505 formatting elements. */
2506 $this->a_formatting[] = self::MARKER;
2507 break;
2509 /* A start tag token whose tag name is "xmp" */
2510 case 'xmp':
2511 /* Reconstruct the active formatting elements, if any. */
2512 $this->reconstructActiveFormattingElements();
2514 /* Insert an HTML element for the token. */
2515 $this->insertElement($token);
2517 /* Switch the content model flag to the CDATA state. */
2518 return HTML5::CDATA;
2519 break;
2521 /* A start tag whose tag name is "table" */
2522 case 'table':
2523 /* If the stack of open elements has a p element in scope,
2524 then act as if an end tag with the tag name p had been seen. */
2525 if ($this->elementInScope('p')) {
2526 $this->emitToken(
2527 array(
2528 'name' => 'p',
2529 'type' => HTML5::ENDTAG
2534 /* Insert an HTML element for the token. */
2535 $this->insertElement($token);
2537 /* Change the insertion mode to "in table". */
2538 $this->mode = self::IN_TABLE;
2539 break;
2541 /* A start tag whose tag name is one of: "area", "basefont",
2542 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2543 case 'area':
2544 case 'basefont':
2545 case 'bgsound':
2546 case 'br':
2547 case 'embed':
2548 case 'img':
2549 case 'param':
2550 case 'spacer':
2551 case 'wbr':
2552 /* Reconstruct the active formatting elements, if any. */
2553 $this->reconstructActiveFormattingElements();
2555 /* Insert an HTML element for the token. */
2556 $this->insertElement($token);
2558 /* Immediately pop the current node off the stack of open elements. */
2559 array_pop($this->stack);
2560 break;
2562 /* A start tag whose tag name is "hr" */
2563 case 'hr':
2564 /* If the stack of open elements has a p element in scope,
2565 then act as if an end tag with the tag name p had been seen. */
2566 if ($this->elementInScope('p')) {
2567 $this->emitToken(
2568 array(
2569 'name' => 'p',
2570 'type' => HTML5::ENDTAG
2575 /* Insert an HTML element for the token. */
2576 $this->insertElement($token);
2578 /* Immediately pop the current node off the stack of open elements. */
2579 array_pop($this->stack);
2580 break;
2582 /* A start tag whose tag name is "image" */
2583 case 'image':
2584 /* Parse error. Change the token's tag name to "img" and
2585 reprocess it. (Don't ask.) */
2586 $token['name'] = 'img';
2587 return $this->inBody($token);
2588 break;
2590 /* A start tag whose tag name is "input" */
2591 case 'input':
2592 /* Reconstruct the active formatting elements, if any. */
2593 $this->reconstructActiveFormattingElements();
2595 /* Insert an input element for the token. */
2596 $element = $this->insertElement($token, false);
2598 /* If the form element pointer is not null, then associate the
2599 input element with the form element pointed to by the form
2600 element pointer. */
2601 $this->form_pointer !== null
2602 ? $this->form_pointer->appendChild($element)
2603 : end($this->stack)->appendChild($element);
2605 /* Pop that input element off the stack of open elements. */
2606 array_pop($this->stack);
2607 break;
2609 /* A start tag whose tag name is "isindex" */
2610 case 'isindex':
2611 /* Parse error. */
2612 // w/e
2614 /* If the form element pointer is not null,
2615 then ignore the token. */
2616 if ($this->form_pointer === null) {
2617 /* Act as if a start tag token with the tag name "form" had
2618 been seen. */
2619 $this->inBody(
2620 array(
2621 'name' => 'body',
2622 'type' => HTML5::STARTTAG,
2623 'attr' => array()
2627 /* Act as if a start tag token with the tag name "hr" had
2628 been seen. */
2629 $this->inBody(
2630 array(
2631 'name' => 'hr',
2632 'type' => HTML5::STARTTAG,
2633 'attr' => array()
2637 /* Act as if a start tag token with the tag name "p" had
2638 been seen. */
2639 $this->inBody(
2640 array(
2641 'name' => 'p',
2642 'type' => HTML5::STARTTAG,
2643 'attr' => array()
2647 /* Act as if a start tag token with the tag name "label"
2648 had been seen. */
2649 $this->inBody(
2650 array(
2651 'name' => 'label',
2652 'type' => HTML5::STARTTAG,
2653 'attr' => array()
2657 /* Act as if a stream of character tokens had been seen. */
2658 $this->insertText(
2659 'This is a searchable index. ' .
2660 'Insert your search keywords here: '
2663 /* Act as if a start tag token with the tag name "input"
2664 had been seen, with all the attributes from the "isindex"
2665 token, except with the "name" attribute set to the value
2666 "isindex" (ignoring any explicit "name" attribute). */
2667 $attr = $token['attr'];
2668 $attr[] = array('name' => 'name', 'value' => 'isindex');
2670 $this->inBody(
2671 array(
2672 'name' => 'input',
2673 'type' => HTML5::STARTTAG,
2674 'attr' => $attr
2678 /* Act as if a stream of character tokens had been seen
2679 (see below for what they should say). */
2680 $this->insertText(
2681 'This is a searchable index. ' .
2682 'Insert your search keywords here: '
2685 /* Act as if an end tag token with the tag name "label"
2686 had been seen. */
2687 $this->inBody(
2688 array(
2689 'name' => 'label',
2690 'type' => HTML5::ENDTAG
2694 /* Act as if an end tag token with the tag name "p" had
2695 been seen. */
2696 $this->inBody(
2697 array(
2698 'name' => 'p',
2699 'type' => HTML5::ENDTAG
2703 /* Act as if a start tag token with the tag name "hr" had
2704 been seen. */
2705 $this->inBody(
2706 array(
2707 'name' => 'hr',
2708 'type' => HTML5::ENDTAG
2712 /* Act as if an end tag token with the tag name "form" had
2713 been seen. */
2714 $this->inBody(
2715 array(
2716 'name' => 'form',
2717 'type' => HTML5::ENDTAG
2721 break;
2723 /* A start tag whose tag name is "textarea" */
2724 case 'textarea':
2725 $this->insertElement($token);
2727 /* Switch the tokeniser's content model flag to the
2728 RCDATA state. */
2729 return HTML5::RCDATA;
2730 break;
2732 /* A start tag whose tag name is one of: "iframe", "noembed",
2733 "noframes" */
2734 case 'iframe':
2735 case 'noembed':
2736 case 'noframes':
2737 $this->insertElement($token);
2739 /* Switch the tokeniser's content model flag to the CDATA state. */
2740 return HTML5::CDATA;
2741 break;
2743 /* A start tag whose tag name is "select" */
2744 case 'select':
2745 /* Reconstruct the active formatting elements, if any. */
2746 $this->reconstructActiveFormattingElements();
2748 /* Insert an HTML element for the token. */
2749 $this->insertElement($token);
2751 /* Change the insertion mode to "in select". */
2752 $this->mode = self::IN_SELECT;
2753 break;
2755 /* A start or end tag whose tag name is one of: "caption", "col",
2756 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2757 "tbody", "td", "tfoot", "th", "thead", "tr". */
2758 case 'caption':
2759 case 'col':
2760 case 'colgroup':
2761 case 'frame':
2762 case 'frameset':
2763 case 'head':
2764 case 'option':
2765 case 'optgroup':
2766 case 'tbody':
2767 case 'td':
2768 case 'tfoot':
2769 case 'th':
2770 case 'thead':
2771 case 'tr':
2772 // Parse error. Ignore the token.
2773 break;
2775 /* A start or end tag whose tag name is one of: "event-source",
2776 "section", "nav", "article", "aside", "header", "footer",
2777 "datagrid", "command" */
2778 case 'event-source':
2779 case 'section':
2780 case 'nav':
2781 case 'article':
2782 case 'aside':
2783 case 'header':
2784 case 'footer':
2785 case 'datagrid':
2786 case 'command':
2787 // Work in progress!
2788 break;
2790 /* A start tag token not covered by the previous entries */
2791 default:
2792 /* Reconstruct the active formatting elements, if any. */
2793 $this->reconstructActiveFormattingElements();
2795 $this->insertElement($token, true, true);
2796 break;
2798 break;
2800 case HTML5::ENDTAG:
2801 switch ($token['name']) {
2802 /* An end tag with the tag name "body" */
2803 case 'body':
2804 /* If the second element in the stack of open elements is
2805 not a body element, this is a parse error. Ignore the token.
2806 (innerHTML case) */
2807 if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2808 // Ignore.
2810 /* If the current node is not the body element, then this
2811 is a parse error. */
2812 } elseif (end($this->stack)->nodeName !== 'body') {
2813 // Parse error.
2816 /* Change the insertion mode to "after body". */
2817 $this->mode = self::AFTER_BODY;
2818 break;
2820 /* An end tag with the tag name "html" */
2821 case 'html':
2822 /* Act as if an end tag with tag name "body" had been seen,
2823 then, if that token wasn't ignored, reprocess the current
2824 token. */
2825 $this->inBody(
2826 array(
2827 'name' => 'body',
2828 'type' => HTML5::ENDTAG
2832 return $this->afterBody($token);
2833 break;
2835 /* An end tag whose tag name is one of: "address", "blockquote",
2836 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2837 "ol", "pre", "ul" */
2838 case 'address':
2839 case 'blockquote':
2840 case 'center':
2841 case 'dir':
2842 case 'div':
2843 case 'dl':
2844 case 'fieldset':
2845 case 'listing':
2846 case 'menu':
2847 case 'ol':
2848 case 'pre':
2849 case 'ul':
2850 /* If the stack of open elements has an element in scope
2851 with the same tag name as that of the token, then generate
2852 implied end tags. */
2853 if ($this->elementInScope($token['name'])) {
2854 $this->generateImpliedEndTags();
2856 /* Now, if the current node is not an element with
2857 the same tag name as that of the token, then this
2858 is a parse error. */
2859 // w/e
2861 /* If the stack of open elements has an element in
2862 scope with the same tag name as that of the token,
2863 then pop elements from this stack until an element
2864 with that tag name has been popped from the stack. */
2865 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2866 if ($this->stack[$n]->nodeName === $token['name']) {
2867 $n = -1;
2870 array_pop($this->stack);
2873 break;
2875 /* An end tag whose tag name is "form" */
2876 case 'form':
2877 /* If the stack of open elements has an element in scope
2878 with the same tag name as that of the token, then generate
2879 implied end tags. */
2880 if ($this->elementInScope($token['name'])) {
2881 $this->generateImpliedEndTags();
2885 if (end($this->stack)->nodeName !== $token['name']) {
2886 /* Now, if the current node is not an element with the
2887 same tag name as that of the token, then this is a parse
2888 error. */
2889 // w/e
2891 } else {
2892 /* Otherwise, if the current node is an element with
2893 the same tag name as that of the token pop that element
2894 from the stack. */
2895 array_pop($this->stack);
2898 /* In any case, set the form element pointer to null. */
2899 $this->form_pointer = null;
2900 break;
2902 /* An end tag whose tag name is "p" */
2903 case 'p':
2904 /* If the stack of open elements has a p element in scope,
2905 then generate implied end tags, except for p elements. */
2906 if ($this->elementInScope('p')) {
2907 $this->generateImpliedEndTags(array('p'));
2909 /* If the current node is not a p element, then this is
2910 a parse error. */
2911 // k
2913 /* If the stack of open elements has a p element in
2914 scope, then pop elements from this stack until the stack
2915 no longer has a p element in scope. */
2916 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2917 if ($this->elementInScope('p')) {
2918 array_pop($this->stack);
2920 } else {
2921 break;
2925 break;
2927 /* An end tag whose tag name is "dd", "dt", or "li" */
2928 case 'dd':
2929 case 'dt':
2930 case 'li':
2931 /* If the stack of open elements has an element in scope
2932 whose tag name matches the tag name of the token, then
2933 generate implied end tags, except for elements with the
2934 same tag name as the token. */
2935 if ($this->elementInScope($token['name'])) {
2936 $this->generateImpliedEndTags(array($token['name']));
2938 /* If the current node is not an element with the same
2939 tag name as the token, then this is a parse error. */
2940 // w/e
2942 /* If the stack of open elements has an element in scope
2943 whose tag name matches the tag name of the token, then
2944 pop elements from this stack until an element with that
2945 tag name has been popped from the stack. */
2946 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2947 if ($this->stack[$n]->nodeName === $token['name']) {
2948 $n = -1;
2951 array_pop($this->stack);
2954 break;
2956 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2957 "h5", "h6" */
2958 case 'h1':
2959 case 'h2':
2960 case 'h3':
2961 case 'h4':
2962 case 'h5':
2963 case 'h6':
2964 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2966 /* If the stack of open elements has in scope an element whose
2967 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2968 generate implied end tags. */
2969 if ($this->elementInScope($elements)) {
2970 $this->generateImpliedEndTags();
2972 /* Now, if the current node is not an element with the same
2973 tag name as that of the token, then this is a parse error. */
2974 // w/e
2976 /* If the stack of open elements has in scope an element
2977 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2978 "h6", then pop elements from the stack until an element
2979 with one of those tag names has been popped from the stack. */
2980 while ($this->elementInScope($elements)) {
2981 array_pop($this->stack);
2984 break;
2986 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2987 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2988 case 'a':
2989 case 'b':
2990 case 'big':
2991 case 'em':
2992 case 'font':
2993 case 'i':
2994 case 'nobr':
2995 case 's':
2996 case 'small':
2997 case 'strike':
2998 case 'strong':
2999 case 'tt':
3000 case 'u':
3001 /* 1. Let the formatting element be the last element in
3002 the list of active formatting elements that:
3003 * is between the end of the list and the last scope
3004 marker in the list, if any, or the start of the list
3005 otherwise, and
3006 * has the same tag name as the token.
3008 while (true) {
3009 for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3010 if ($this->a_formatting[$a] === self::MARKER) {
3011 break;
3013 } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3014 $formatting_element = $this->a_formatting[$a];
3015 $in_stack = in_array($formatting_element, $this->stack, true);
3016 $fe_af_pos = $a;
3017 break;
3021 /* If there is no such node, or, if that node is
3022 also in the stack of open elements but the element
3023 is not in scope, then this is a parse error. Abort
3024 these steps. The token is ignored. */
3025 if (!isset($formatting_element) || ($in_stack &&
3026 !$this->elementInScope($token['name']))
3028 break;
3030 /* Otherwise, if there is such a node, but that node
3031 is not in the stack of open elements, then this is a
3032 parse error; remove the element from the list, and
3033 abort these steps. */
3034 } elseif (isset($formatting_element) && !$in_stack) {
3035 unset($this->a_formatting[$fe_af_pos]);
3036 $this->a_formatting = array_merge($this->a_formatting);
3037 break;
3040 /* 2. Let the furthest block be the topmost node in the
3041 stack of open elements that is lower in the stack
3042 than the formatting element, and is not an element in
3043 the phrasing or formatting categories. There might
3044 not be one. */
3045 $fe_s_pos = array_search($formatting_element, $this->stack, true);
3046 $length = count($this->stack);
3048 for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3049 $category = $this->getElementCategory($this->stack[$s]->nodeName);
3051 if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3052 $furthest_block = $this->stack[$s];
3056 /* 3. If there is no furthest block, then the UA must
3057 skip the subsequent steps and instead just pop all
3058 the nodes from the bottom of the stack of open
3059 elements, from the current node up to the formatting
3060 element, and remove the formatting element from the
3061 list of active formatting elements. */
3062 if (!isset($furthest_block)) {
3063 for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3064 array_pop($this->stack);
3067 unset($this->a_formatting[$fe_af_pos]);
3068 $this->a_formatting = array_merge($this->a_formatting);
3069 break;
3072 /* 4. Let the common ancestor be the element
3073 immediately above the formatting element in the stack
3074 of open elements. */
3075 $common_ancestor = $this->stack[$fe_s_pos - 1];
3077 /* 5. If the furthest block has a parent node, then
3078 remove the furthest block from its parent node. */
3079 if ($furthest_block->parentNode !== null) {
3080 $furthest_block->parentNode->removeChild($furthest_block);
3083 /* 6. Let a bookmark note the position of the
3084 formatting element in the list of active formatting
3085 elements relative to the elements on either side
3086 of it in the list. */
3087 $bookmark = $fe_af_pos;
3089 /* 7. Let node and last node be the furthest block.
3090 Follow these steps: */
3091 $node = $furthest_block;
3092 $last_node = $furthest_block;
3094 while (true) {
3095 for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3096 /* 7.1 Let node be the element immediately
3097 prior to node in the stack of open elements. */
3098 $node = $this->stack[$n];
3100 /* 7.2 If node is not in the list of active
3101 formatting elements, then remove node from
3102 the stack of open elements and then go back
3103 to step 1. */
3104 if (!in_array($node, $this->a_formatting, true)) {
3105 unset($this->stack[$n]);
3106 $this->stack = array_merge($this->stack);
3108 } else {
3109 break;
3113 /* 7.3 Otherwise, if node is the formatting
3114 element, then go to the next step in the overall
3115 algorithm. */
3116 if ($node === $formatting_element) {
3117 break;
3119 /* 7.4 Otherwise, if last node is the furthest
3120 block, then move the aforementioned bookmark to
3121 be immediately after the node in the list of
3122 active formatting elements. */
3123 } elseif ($last_node === $furthest_block) {
3124 $bookmark = array_search($node, $this->a_formatting, true) + 1;
3127 /* 7.5 If node has any children, perform a
3128 shallow clone of node, replace the entry for
3129 node in the list of active formatting elements
3130 with an entry for the clone, replace the entry
3131 for node in the stack of open elements with an
3132 entry for the clone, and let node be the clone. */
3133 if ($node->hasChildNodes()) {
3134 $clone = $node->cloneNode();
3135 $s_pos = array_search($node, $this->stack, true);
3136 $a_pos = array_search($node, $this->a_formatting, true);
3138 $this->stack[$s_pos] = $clone;
3139 $this->a_formatting[$a_pos] = $clone;
3140 $node = $clone;
3143 /* 7.6 Insert last node into node, first removing
3144 it from its previous parent node if any. */
3145 if ($last_node->parentNode !== null) {
3146 $last_node->parentNode->removeChild($last_node);
3149 $node->appendChild($last_node);
3151 /* 7.7 Let last node be node. */
3152 $last_node = $node;
3155 /* 8. Insert whatever last node ended up being in
3156 the previous step into the common ancestor node,
3157 first removing it from its previous parent node if
3158 any. */
3159 if ($last_node->parentNode !== null) {
3160 $last_node->parentNode->removeChild($last_node);
3163 $common_ancestor->appendChild($last_node);
3165 /* 9. Perform a shallow clone of the formatting
3166 element. */
3167 $clone = $formatting_element->cloneNode();
3169 /* 10. Take all of the child nodes of the furthest
3170 block and append them to the clone created in the
3171 last step. */
3172 while ($furthest_block->hasChildNodes()) {
3173 $child = $furthest_block->firstChild;
3174 $furthest_block->removeChild($child);
3175 $clone->appendChild($child);
3178 /* 11. Append that clone to the furthest block. */
3179 $furthest_block->appendChild($clone);
3181 /* 12. Remove the formatting element from the list
3182 of active formatting elements, and insert the clone
3183 into the list of active formatting elements at the
3184 position of the aforementioned bookmark. */
3185 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3186 unset($this->a_formatting[$fe_af_pos]);
3187 $this->a_formatting = array_merge($this->a_formatting);
3189 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3190 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3191 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3193 /* 13. Remove the formatting element from the stack
3194 of open elements, and insert the clone into the stack
3195 of open elements immediately after (i.e. in a more
3196 deeply nested position than) the position of the
3197 furthest block in that stack. */
3198 $fe_s_pos = array_search($formatting_element, $this->stack, true);
3199 $fb_s_pos = array_search($furthest_block, $this->stack, true);
3200 unset($this->stack[$fe_s_pos]);
3202 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3203 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3204 $this->stack = array_merge($s_part1, array($clone), $s_part2);
3206 /* 14. Jump back to step 1 in this series of steps. */
3207 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3209 break;
3211 /* An end tag token whose tag name is one of: "button",
3212 "marquee", "object" */
3213 case 'button':
3214 case 'marquee':
3215 case 'object':
3216 /* If the stack of open elements has an element in scope whose
3217 tag name matches the tag name of the token, then generate implied
3218 tags. */
3219 if ($this->elementInScope($token['name'])) {
3220 $this->generateImpliedEndTags();
3222 /* Now, if the current node is not an element with the same
3223 tag name as the token, then this is a parse error. */
3224 // k
3226 /* Now, if the stack of open elements has an element in scope
3227 whose tag name matches the tag name of the token, then pop
3228 elements from the stack until that element has been popped from
3229 the stack, and clear the list of active formatting elements up
3230 to the last marker. */
3231 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3232 if ($this->stack[$n]->nodeName === $token['name']) {
3233 $n = -1;
3236 array_pop($this->stack);
3239 $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3241 for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3242 array_pop($this->a_formatting);
3245 break;
3247 /* Or an end tag whose tag name is one of: "area", "basefont",
3248 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3249 "input", "isindex", "noembed", "noframes", "param", "select",
3250 "spacer", "table", "textarea", "wbr" */
3251 case 'area':
3252 case 'basefont':
3253 case 'bgsound':
3254 case 'br':
3255 case 'embed':
3256 case 'hr':
3257 case 'iframe':
3258 case 'image':
3259 case 'img':
3260 case 'input':
3261 case 'isindex':
3262 case 'noembed':
3263 case 'noframes':
3264 case 'param':
3265 case 'select':
3266 case 'spacer':
3267 case 'table':
3268 case 'textarea':
3269 case 'wbr':
3270 // Parse error. Ignore the token.
3271 break;
3273 /* An end tag token not covered by the previous entries */
3274 default:
3275 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3276 /* Initialise node to be the current node (the bottommost
3277 node of the stack). */
3278 $node = end($this->stack);
3280 /* If node has the same tag name as the end tag token,
3281 then: */
3282 if ($token['name'] === $node->nodeName) {
3283 /* Generate implied end tags. */
3284 $this->generateImpliedEndTags();
3286 /* If the tag name of the end tag token does not
3287 match the tag name of the current node, this is a
3288 parse error. */
3289 // k
3291 /* Pop all the nodes from the current node up to
3292 node, including node, then stop this algorithm. */
3293 for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3294 array_pop($this->stack);
3297 } else {
3298 $category = $this->getElementCategory($node);
3300 if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3301 /* Otherwise, if node is in neither the formatting
3302 category nor the phrasing category, then this is a
3303 parse error. Stop this algorithm. The end tag token
3304 is ignored. */
3305 return false;
3309 break;
3311 break;
/**
 * Processes one token while the tree builder is in the "in table"
 * insertion mode.
 *
 * Whitespace and comments are appended to the current node; table
 * structure start tags (caption, colgroup, col, tbody/tfoot/thead,
 * td/th/tr, table) and the "table" end tag are handled here or handed to
 * the matching sibling handler; every other token is processed with the
 * "in body" rules, after choosing a foster parent when the current node
 * is a table-structure element.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start/end tags and 'data' for character and
 *                     comment tokens.
 * @return mixed false when a "table" end tag (real or implied) is ignored;
 *               otherwise the result of the handler the token was
 *               delegated to (inTableBody, mainPhase or inBody), or null
 *               when the token was fully handled here.
 */
private function inTable($token)
{
    $clear = array('html', 'table');

    $type = $token['type'];
    $isStartTag = $type === HTML5::STARTTAG;
    $isEndTag = $type === HTML5::ENDTAG;
    /* Only tag tokens are asked for a name; other token types are not
    assumed to carry one. */
    $name = ($isStartTag || $isEndTag) ? $token['name'] : null;

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif ($isStartTag && $name === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif ($isStartTag && $name === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif ($isStartTag && $name === 'col') {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then hand the current token to the column group
        handler. */
        $this->inTable(
            array(
                'name' => 'colgroup',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($isStartTag && in_array($name, array('tbody', 'tfoot', 'thead'), true)) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif ($isStartTag && in_array($name, array('td', 'th', 'tr'), true)) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'tbody',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif ($isStartTag && $name === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $closed = $this->inTable(
            array(
                'name' => 'table',
                'type' => HTML5::ENDTAG
            )
        );

        /* The end tag branch below returns false exactly when it ignores
        the token. In that case nothing was popped and the insertion mode
        was not reset, so reprocessing would presumably route straight
        back here (mainPhase is not visible from this method - confirm);
        drop the token instead. */
        if ($closed === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif ($isEndTag && $name === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($name, true)) {
            return false;
        }

        /* Otherwise: generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a table element, then this
        is a parse error. */
        // w/e

        /* Pop elements from this stack until a table element has been
        popped from the stack. */
        do {
            $popped = end($this->stack)->nodeName;
            array_pop($this->stack);
        } while ($popped !== 'table');

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($isEndTag && in_array(
        $name,
        array(
            'body',
            'caption',
            'col',
            'colgroup',
            'html',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        ),
        true
    )
    ) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if (in_array(
            end($this->stack)->nodeName,
            array('table', 'tbody', 'tfoot', 'thead', 'tr'),
            true
        )
        ) {
            /* Locate the last table element in the stack of open
            elements, remembering its position. */
            $table = null;
            $tableIndex = -1;

            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    $tableIndex = $n;
                    break;
                }
            }

            if ($table === null) {
                /* No table element in the stack of open elements
                (innerHTML case): the foster parent element is the first
                element in the stack (the html element). */
                $this->foster_parent = $this->stack[0];

            } elseif ($table->parentNode !== null &&
                $table->parentNode->nodeType === XML_ELEMENT_NODE
            ) {
                /* The table has a parent and that parent is an element:
                it is the foster parent. The element check has to happen
                here; testing only for a non-null parent would let a
                non-element parent through and make the fallback below
                unreachable for that case. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* The table has no parent, or its parent is not an
                element: the foster parent is the element before the table
                in the stack of open elements. Fall back to the first
                stack entry if the table is itself the first entry, so an
                undefined offset is never read. */
                $this->foster_parent = $tableIndex > 0
                    ? $this->stack[$tableIndex - 1]
                    : $this->stack[0];
            }
        }

        /* Propagate the "in body" result: inBody() returns content model
        flags (HTML5::PLAINTEXT, HTML5::CDATA, HTML5::RCDATA) for some
        start tags, and discarding them here would hide that switch from
        whoever consumes this method's return value. */
        return $this->inBody($token);
    }
}
/**
 * Handles a token while the insertion mode is "in caption".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start/end tag tokens.
 * @return mixed Result of inTable() when the token is reprocessed there;
 *               otherwise nothing.
 */
private function inCaption($token)
{
    /* An end tag whose tag name is "caption" */
    if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

            /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
        name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )) || ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table')
    ) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $this->inCaption(
            array(
                'name' => 'caption',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);

        /* An end tag whose tag name is one of: "body", "col", "colgroup",
        "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'body',
            'col',
            'colgroup',
            'html',
            'tbody',
            // 'td' was missing from this list although the rule names it;
            // without it a stray </td> was handed to inBody().
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )
    ) {
        // Parse error. Ignore the token.

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Handles a token while the insertion mode is "in column group".
 *
 * @param array $token Token array ('type' plus 'name' or 'data').
 * @return mixed Result of inTable() when the token is reprocessed there;
 *               otherwise nothing.
 */
private function inColumnGroup($token)
{
    $type = $token['type'];

    // Whitespace-only character data is appended to the current node.
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    // Comments become Comment nodes under the current node.
    if ($type === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($commentNode);
        return;
    }

    // <col>: insert the element and immediately pop it off the stack again.
    if ($type === HTML5::STARTTAG && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    // </colgroup>: ignored when the current node is the root html element
    // (innerHTML case); otherwise pop it and go back to "in table".
    if ($type === HTML5::ENDTAG && $token['name'] === 'colgroup') {
        if (end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    // </col>: parse error, the token is ignored.
    if ($type === HTML5::ENDTAG && $token['name'] === 'col') {
        return;
    }

    // Anything else: behave as if </colgroup> had been seen, then hand the
    // current token to the "in table" handler.
    $this->inColumnGroup(
        array(
            'name' => 'colgroup',
            'type' => HTML5::ENDTAG
        )
    );

    return $this->inTable($token);
}
/**
 * Handles a token while the insertion mode is "in table body".
 *
 * @param array $token Token array; 'type' and 'name' are read.
 * @return mixed Result of the handler the token is reprocessed in
 *               (inRow() or mainPhase()); otherwise nothing.
 */
private function inTableBody($token)
{
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

        /* A start tag whose tag name is one of: "th", "td" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(
            array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inRow($token);

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

            /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        // Was 'tfoor', which could never match a real tag name.
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead')
    )) ||
        // Was a STARTTAG test; the rule is about the END tag "table".
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
    ) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

            /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(
                array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->mainPhase($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
    )
    ) {
        /* Parse error. Ignore the token. */

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Handles a token while the insertion mode is "in row".
 *
 * Tokens that implicitly close the row are reprocessed by the
 * "in table body" handler, because closing the tr switches the insertion
 * mode to "in table body". Reprocessing them in inCell() (as was done
 * before) dropped start tags and could recurse without end on </tbody>.
 *
 * @param array $token Token array; 'type' and 'name' are read.
 * @return mixed Result of inTableBody() when the token is reprocessed
 *               there; otherwise nothing.
 */
private function inRow($token)
{
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if ($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')
    ) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

        /* An end tag whose tag name is "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

            /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", "tr".
        (An end tag "table" is not matched here; it reaches the "anything
        else" branch and is handled by inTable().) */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
    )
    ) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. */
        if ($this->elementInScope('tr', true)) {
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            // The row is closed, so the token belongs to "in table body".
            return $this->inTableBody($token);
        }
        // No tr in table scope: the implied end tag would be ignored, so
        // the current token is ignored as well.

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

            /* Otherwise: */
        } else {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            // Reprocess in "in table body", which knows how to close the
            // section; going through inCell() bounced back here forever.
            return $this->inTableBody($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th')
    )
    ) {
        /* Parse error. Ignore the token. */

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Handles a token while the insertion mode is "in cell".
 *
 * @param array $token Token array; 'type' and 'name' are read.
 * @return mixed Result of inRow() when the token is reprocessed there;
 *               otherwise nothing.
 */
private function inCell($token)
{
    /* An end tag whose tag name is one of: "td", "th" */
    if ($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

            /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */
            // k

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr".
        (This branch used to appear twice with an identical condition; the
        second copy could never run and has been removed.) */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
        $token['name'],
        array(
            'caption',
            'col',
            'colgroup',
            'tbody',
            'td',
            'tfoot',
            'th',
            'thead',
            'tr'
        )
    )
    ) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if (!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html')
    )
    ) {
        /* Parse error. Ignore the token. */

        /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
        "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr')
    )
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Handles a token while the insertion mode is "in select".
 *
 * Tag-name tests check the token type first, so 'name' is only read on
 * start/end tag tokens.
 *
 * @param array $token Token array ('type' plus 'name' or 'data').
 */
private function inSelect($token)
{
    /* Handle the token as follows: */

    /* A character token */
    if ($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* A start tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* A start tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if (end($this->stack)->nodeName === 'option') {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if (end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(
                array(
                    'name' => 'optgroup',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* An end tag token whose tag name is "optgroup" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup'
    ) {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        if ($elements_in_stack >= 2 &&
            $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
        ) {
            $this->inSelect(
                array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                )
            );
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        // Look at the *current* top of the stack (the option may just have
        // been popped) and compare its nodeName; the old code compared the
        // node object itself to a string, which was never true.
        if (end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

        /* An end tag token whose tag name is "option" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option'
    ) {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if (end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

        /* An end tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // w/e

            /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while (true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

        /* A start tag whose tag name is "select" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select'
    ) {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(
            array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            )
        );

        /* An end tag whose tag name is one of: "caption", "table", "tbody",
        "tfoot", "thead", "tr", "td", "th" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
        $token['name'],
        array(
            'caption',
            'table',
            'tbody',
            'tfoot',
            'thead',
            'tr',
            'td',
            'th'
        )
    )
    ) {
        /* Parse error. */
        // w/e

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if ($this->elementInScope($token['name'], true)) {
            $this->inSelect(
                array(
                    'name' => 'select',
                    'type' => HTML5::ENDTAG
                )
            );

            $this->mainPhase($token);
        }

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles a token while the insertion mode is "after body".
 *
 * @param array $token Token array ('type' plus 'name' or 'data').
 * @return mixed Result of inBody() when the token is reprocessed after
 *               switching back to "in body"; otherwise nothing.
 */
private function afterBody($token)
{
    $type = $token['type'];

    // Whitespace-only character data is processed exactly as "in body".
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->inBody($token);
        return;
    }

    // A comment is attached to the first element on the stack of open
    // elements (the html element).
    if ($type === HTML5::COMMENT) {
        $htmlElement = $this->stack[0];
        $htmlElement->appendChild($this->dom->createComment($token['data']));
        return;
    }

    // </html>: move on to the trailing end phase. (The innerHTML case, in
    // which the token would be ignored, is not distinguished here.)
    if ($type === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    // Anything else is a parse error: fall back to "in body" and reprocess.
    $this->mode = self::IN_BODY;

    return $this->inBody($token);
}
/**
 * Handles a token while the insertion mode is "in frameset".
 *
 * Tag-name tests check the token type first, so 'name' is only read on
 * start/end tag tokens.
 *
 * @param array $token Token array ('type' plus 'name' or 'data').
 */
private function inFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* A start tag with the tag name "frameset" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frameset'
    ) {
        $this->insertElement($token);

        /* An end tag with the tag name "frameset" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'frameset'
    ) {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if (end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the parser was not originally created in order to handle
            the setting of an element's innerHTML attribute (innerHTML case),
            and the current node is no longer a frameset element, then change
            the insertion mode to "after frameset". */
            // Previously the mode was switched unconditionally, which left
            // "in frameset" too early when framesets are nested.
            if (end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

        /* A start tag with the tag name "frame" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frame'
    ) {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

        /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes'
    ) {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles a token while the insertion mode is "after frameset".
 *
 * Tag-name tests check the token type first, so 'name' is only read on
 * start/end tag tokens. Reading it first raised an undefined-index notice
 * for any token without that key (NOTE(review): character tokens are
 * presumed to carry only 'data' -- confirm against the tokenizer).
 *
 * @param array $token Token array ('type' plus 'name' or 'data').
 */
private function afterFrameset($token)
{
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

        /* An end tag with the tag name "html" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'html'
    ) {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

        /* A start tag with the tag name "noframes" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes'
    ) {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

        /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles a token during the trailing end phase.
 *
 * @param array $token Token array ('type' plus 'name' or 'data').
 * @return mixed Result of mainPhase() when the parser switches back to the
 *               main phase and reprocesses the token; otherwise nothing.
 */
private function trailingEndPhase($token)
{
    /* After the main phase, as each token is emitted from the tokenisation
    stage, it must be processed as described in this section. */

    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
    } elseif ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

        /* A character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE. Or a start tag token. Or an end tag token. */
    } elseif (($token['type'] === HTML5::CHARACTR &&
            // The negation was missing, so this clause could never be true
            // (whitespace-only data is already handled above) and
            // non-whitespace text was silently dropped.
            !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
        $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG
    ) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

        /* An end-of-file token */
    } elseif ($token['type'] === HTML5::EOF) {
        /* OMG DONE!! */
    }
}
/**
 * Creates an element for a start tag token, attaches it to the document
 * and pushes it onto the stack of open elements.
 *
 * @param array $token  Start tag token; 'name' and 'attr' are read. Each
 *                      attribute entry provides 'name' and 'value'.
 * @param bool  $append Not consulted by this method.
 * @param bool  $check  When true, the tag name is reduced to characters
 *                      libxml2 accepts before the element is created.
 * @return mixed The element returned by $this->dom->createElement().
 */
private function insertElement($token, $append = true, $check = false)
{
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names.
    if ($check) {
        // Keep only ASCII letters, digits and hyphens, then strip leading
        // hyphens and digits.
        $tagName = ltrim(preg_replace('/[^a-z0-9-]/i', '', $tagName), '-0..9');

        // Should never be needed, but fall back to an arbitrary generic
        // element if nothing is left of the name.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute wins; later duplicates are skipped.
    foreach ($token['attr'] as $attribute) {
        if ($element->hasAttribute($attribute['name'])) {
            continue;
        }
        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
/**
 * Creates a text node from the given data and attaches it via
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
/**
 * Creates a comment node from the given data and attaches it via
 * appendToRealParent().
 *
 * @param string $data Content of the new comment node.
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
/**
 * Attaches a node either to the current node or, when a foster parent is
 * set, to that foster parent. The foster parent is cleared after use.
 *
 * @param mixed $node Node to attach; it is passed to appendChild() or
 *                    insertBefore().
 */
private function appendToRealParent($node)
{
    // Normal case: no foster parenting pending, append to the current node.
    if ($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the last
    table element in the stack of open elements, then the new node must be
    inserted immediately before the last table element in the stack of
    open elements in the foster parent element; otherwise, the new node
    must be appended to the foster parent element. */
    $lastTable = null;
    $index = count($this->stack) - 1;

    // Walk the stack from the top down looking for a table that is
    // attached to a parent.
    while ($index >= 0) {
        $candidate = $this->stack[$index];

        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }

        $index--;
    }

    if ($lastTable !== null &&
        $this->foster_parent->isSameNode($lastTable->parentNode)
    ) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
/**
 * Tells whether the stack of open elements has the given element in scope.
 *
 * The stack is searched from the current node upwards. The search fails at
 * a table element or at the root element; in the plain "in scope" variant
 * it also fails at caption, td, th, button, marquee and object elements.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            (true if any one of them is in scope).
 * @param bool         $table True selects the "has an element in table
 *                            scope" variant.
 * @return bool
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    $leng = count($this->stack);

    for ($n = 0; $n < $leng; $n++) {
        /* 1. Initialise node to be the current node (the bottommost node of
        the stack). */
        $node = $this->stack[$leng - 1 - $n];

        if ($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;

        } elseif ($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a failure
            state. */
            return false;

        } elseif ($table !== true && in_array(
            $node->tagName,
            array(
                'caption',
                'td',
                'th',
                'button',
                'marquee',
                'object'
            )
        )
        ) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the following, terminate in a failure state. */
            // The test used to be `$table === true`, i.e. applied to the
            // wrong variant: table-scope lookups stopped at a td/th and
            // plain scope lookups did not.
            return false;

        } elseif ($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element), terminate
            in a failure state. (This can only happen if the node is the topmost
            node of the stack of open elements, and prevents the next step from
            being invoked if there are no more elements in the stack.) */
            return false;
        }

        /* Otherwise, set node to the previous entry in the stack of open
        elements and return to step 2. (This will never fail, since the loop
        will always terminate in the previous step if the top of the stack
        is reached.) */
    }

    // Stack exhausted without a decision: report "not in scope" explicitly
    // instead of falling off the end of the function.
    return false;
}
/**
 * Reconstructs the active formatting elements: entries at the end of the
 * list of active formatting elements that are no longer on the stack of
 * open elements are cloned, appended to the current node, pushed onto the
 * stack, and replace their originals in the list.
 *
 * @return false|null False when there is nothing to reconstruct; otherwise
 *                    nothing is returned.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    // By default the rewind below stops ON a marker or an element that is
    // still open, so the first pass of the second loop must advance to the
    // entry after it (step 7). Only when the rewind runs off the start of
    // the list (step 4) is step 7 skipped. Previously this flag was left
    // unset in the first case, so the marker itself was "cloned".
    $step_seven = true;
    $a = $formatting_elements - 1;

    while (true) {
        /* 4. If there are no entries before entry in the list of active
        formatting elements, then jump to step 8. */
        if ($a === 0) {
            $step_seven = false;
            break;
        }

        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. */
        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            break;
        }
    }

    while (true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if ($step_seven === true) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if (end($this->a_formatting) !== $clone) {
            $step_seven = true;
        } else {
            break;
        }
    }
}
/**
 * Clears the list of active formatting elements up to the last marker.
 *
 * Entries are removed from the end of $this->a_formatting until a marker
 * has been removed. If the list contains no marker it is simply emptied;
 * the loop is bounded by the list size so it cannot spin on an empty list.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    while (count($this->a_formatting) > 0) {
        /* 1-2. Let entry be the last (most recently added) entry in the
        list of active formatting elements and remove it from the list. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
/**
 * Generates implied end tags.
 *
 * While the current node (the last entry of $this->stack) is a dd, dt, li,
 * p, td, th or tr element whose name is not listed in $exclude, it is
 * popped off the stack of open elements.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array())
{
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    /* end() returns false on an empty stack; stop there rather than reading
    a property of a non-object. */
    while (($node = end($this->stack)) !== false
        && in_array($node->nodeName, $elements, true)
    ) {
        array_pop($this->stack);
    }
}
/**
 * Classifies an element by its tag name.
 *
 * The tag name is looked up in $this->special, $this->scoping and
 * $this->formatting, in that order; the first list containing it decides
 * the category. Anything not found in any list is treated as phrasing.
 *
 * @param object $node Node whose tagName property is inspected.
 * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
 *               self::PHRASING.
 */
private function getElementCategory($node)
{
    $tag = $node->tagName;

    if (in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    if (in_array($tag, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
/**
 * Clears the stack of open elements back to a table context.
 *
 * Pops entries off $this->stack until the current node's name is one of
 * $elements. If no entry matches, the stack ends up empty and the method
 * returns instead of looping forever.
 *
 * @param array $elements Node names at which popping stops.
 * @return void
 */
private function clearStackToTableContext($elements)
{
    /* end() returns false once the stack is empty; without this check the
    loop could never terminate when no listed element is on the stack. */
    while (($node = end($this->stack)) !== false
        && !in_array($node->nodeName, $elements, true)
    ) {
        array_pop($this->stack);
    }
}
/**
 * Resets the insertion mode appropriately.
 *
 * Walks $this->stack from the current node down to the first entry and
 * sets $this->mode from the first node whose name selects a mode. If the
 * first entry of the stack is reached without a match, the mode falls back
 * to "in body".
 *
 * @return void
 */
private function resetInsertionMode()
{
    /* 1. Let last be false. */
    $last = false;

    /* 2. Let node be the last node in the stack of open elements, then
    work downwards one entry at a time. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. */
        if ($this->stack[0]->isSameNode($node)) {
            $last = true;
        }

        switch ($node->nodeName) {
            /* 4. select: "in select". */
            case 'select':
                $this->mode = self::IN_SELECT;
                break 2;

            /* 5. td or th: "in cell". */
            case 'td':
            case 'th':
                $this->mode = self::IN_CELL;
                break 2;

            /* 6. tr: "in row". */
            case 'tr':
                $this->mode = self::IN_ROW;
                break 2;

            /* 7. tbody, thead or tfoot: "in table body". */
            case 'tbody':
            case 'thead':
            case 'tfoot':
                $this->mode = self::IN_TBODY;
                break 2;

            /* 8. caption: "in caption". */
            case 'caption':
                $this->mode = self::IN_CAPTION;
                break 2;

            /* 9. colgroup: "in column group". */
            case 'colgroup':
                $this->mode = self::IN_CGROUP;
                break 2;

            /* 10. table: "in table". */
            case 'table':
                $this->mode = self::IN_TABLE;
                break 2;

            /* 11-12. head ("in body"! not "in head"!) and body both select
            "in body". */
            case 'head':
            case 'body':
                $this->mode = self::IN_BODY;
                break 2;

            /* 13. frameset: "in frameset". */
            case 'frameset':
                $this->mode = self::IN_FRAME;
                break 2;

            /* 14. html: "before head" while the head element pointer is
            null, otherwise "after head". */
            case 'html':
                if ($this->head_pointer === null) {
                    $this->mode = self::BEFOR_HEAD;
                } else {
                    $this->mode = self::AFTER_HEAD;
                }
                break 2;

            /* 15. No name matched: if last is true, set the insertion mode
            to "in body" and stop; otherwise continue with the next node. */
            default:
                if ($last) {
                    $this->mode = self::IN_BODY;
                    break 2;
                }
        }
    }
}
/**
 * Closes the currently open table cell, if there is one.
 *
 * If the stack of open elements has a td element in table scope, or
 * failing that a th element, an end tag token with that name is passed to
 * inCell(). When neither is in table scope nothing happens.
 *
 * @return void
 */
private function closeCell()
{
    $open_cell = null;

    if ($this->elementInScope('td', true)) {
        $open_cell = 'td';
    } elseif ($this->elementInScope('th', true)) {
        $open_cell = 'th';
    }

    if ($open_cell === null) {
        return;
    }

    /* Act as if an end tag token with that tag name had been seen. */
    $end_tag = array(
        'name' => $open_cell,
        'type' => HTML5::ENDTAG
    );
    $this->inCell($end_tag);
}
/**
 * Returns the value held in $this->dom.
 *
 * @return mixed The current value of the dom property.
 */
public function save()
{
    $document = $this->dom;

    return $document;
}