[3.1.0] Fixed fatal error in PH5P lexer with invalid tag names
[htmlpurifier.git] / library / HTMLPurifier / Lexer / PH5P.php
blob9fa92448c0fbda50ee44fa58d5cb2268322797dd
1 <?php
3 /**
4 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
5 * Requires PHP5, and occupies space in the HTML5 pseudo-namespace (may
6 * cause conflicts, sorry).
7 */
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {

    /**
     * Tokenizes HTML by running it through the HTML5 parser and converting
     * the resulting DOM into a token list.
     *
     * The input is first passed through normalize() and wrapHTML(); the
     * parsed document is then walked html -> body -> div and the first <div>
     * is handed to tokenizeDOM(), which fills the token array by reference.
     *
     * @param string $html    Markup to tokenize.
     * @param mixed  $config  Passed through to normalize() and wrapHTML().
     * @param mixed  $context Passed through to normalize() and wrapHTML().
     * @return array Tokens collected by tokenizeDOM().
     */
    public function tokenizeHTML($html, $config, $context) {
        $normalized = $this->normalize($html, $config, $context);
        $wrapped    = $this->wrapHTML($normalized, $config, $context);

        $parser   = new HTML5($wrapped);
        $document = $parser->save();

        // Descend to the first <div> inside the first <body> of the first <html>.
        $html_node = $document->getElementsByTagName('html')->item(0);
        $body_node = $html_node->getElementsByTagName('body')->item(0);
        $div_node  = $body_node->getElementsByTagName('div')->item(0);

        $tokens = array();
        $this->tokenizeDOM($div_node, $tokens);
        return $tokens;
    }
}
29 Copyright 2007 Jeroen van der Meer <http://jero.net/>
31 Permission is hereby granted, free of charge, to any person obtaining a
32 copy of this software and associated documentation files (the
33 "Software"), to deal in the Software without restriction, including
34 without limitation the rights to use, copy, modify, merge, publish,
35 distribute, sublicense, and/or sell copies of the Software, and to
36 permit persons to whom the Software is furnished to do so, subject to
37 the following conditions:
39 The above copyright notice and this permission notice shall be included
40 in all copies or substantial portions of the Software.
42 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
43 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
45 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
46 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
47 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
48 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
52 class HTML5 {
53 private $data;
54 private $char;
55 private $EOF;
56 private $state;
57 private $tree;
58 private $token;
59 private $content_model;
60 private $escape = false;
61 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
62 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
63 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
64 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
65 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
66 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
67 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
68 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
69 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
70 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
71 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
72 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
73 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
74 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
75 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
76 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
77 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
78 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
79 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
80 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
81 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
82 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
83 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
84 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
85 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
86 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
87 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
88 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
89 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
90 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
91 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
92 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
93 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
94 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
95 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
96 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
97 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
98 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
99 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
100 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
101 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
102 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
104 const PCDATA = 0;
105 const RCDATA = 1;
106 const CDATA = 2;
107 const PLAINTEXT = 3;
109 const DOCTYPE = 0;
110 const STARTTAG = 1;
111 const ENDTAG = 2;
112 const COMMENT = 3;
113 const CHARACTR = 4;
114 const EOF = 5;
/**
 * Normalises line endings, initialises the tokenizer and immediately runs
 * the state machine until the current state becomes null.
 *
 * @param string $data Markup to tokenize.
 */
public function __construct($data) {
    // Collapse CRLF pairs into LF, then drop any remaining lone CR.
    // The replacement is '' rather than null: the result is the same, but
    // passing null to str_replace() is deprecated as of PHP 8.1.
    $data = str_replace("\r\n", "\n", $data);
    $data = str_replace("\r", '', $data);

    $this->data = $data;
    $this->char = -1; // state handlers advance the cursor before reading
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    // Dispatch to the "<state>State" method named by the current state.
    while($this->state !== null) {
        $this->{$this->state.'State'}();
    }
}
/**
 * Returns whatever the tree constructor's save() method produces.
 */
public function save() {
    $result = $this->tree->save();
    return $result;
}
/**
 * Returns the character at the current cursor position, or false once the
 * cursor is no longer before the end of the input.
 */
private function char() {
    if($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
/**
 * Reads from the input without moving the cursor.
 *
 * @param int $s Offset to read from.
 * @param int $l Number of characters to read; 0 means a single character.
 * @return string|null The requested character or substring, or null when
 *                     the request does not lie completely inside the input.
 */
private function character($s, $l = 0) {
    if($s < 0) {
        // Negative offsets never denote a valid stream position.
        return null;
    }

    if($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // A substring covering [$s, $s + $l) fits as long as it ends at or before
    // the end of the input. The previous strict "<" comparison rejected
    // substrings that end exactly on the last character.
    if($s + $l <= $this->EOF) {
        return substr($this->data, $s, $l);
    }

    return null;
}
/**
 * Returns the longest run of characters, starting exactly at $start, that
 * belong to the given character class.
 *
 * @param string $char_class Contents of a PCRE character class (for example
 *                           'A-Za-z' or '^>'), inserted between [ and ].
 * @param int    $start      Offset in the input at which the run must begin.
 * @return string The matching run, or '' when the character at $start is not
 *                in the class or $start is at/after the end of the input.
 */
private function characters($char_class, $start) {
    if($start >= $this->EOF) {
        return '';
    }

    // \G anchors the match at the offset passed to preg_match(). The earlier
    // preg_replace() approach returned the whole remaining input whenever the
    // first character was not in the class, because a failed replacement
    // leaves the subject untouched.
    if(preg_match('#\G['.$char_class.']+#', $this->data, $run, 0, $start)) {
        return $run[0];
    }

    return '';
}
/**
 * Data state: consumes the next input character (or a run of plain text)
 * and either emits character tokens or switches to the entity-data or
 * tag-open state, depending on the content model and the escape flag.
 */
private function dataState() {
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    $model = $this->content_model;
    $rcdata_or_cdata = ($model === self::RCDATA || $model === self::CDATA);

    if($char === '&' && ($model === self::PCDATA || $model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        "<!--", then set the escape flag to true. */
        // The four-character window ENDS on the current character, so it
        // starts three positions back (not four).
        if($rcdata_or_cdata && $this->escape === false &&
        $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--') {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    /* U+003C LESS-THAN SIGN (<) */
    } elseif($char === '<' && ($model === self::PCDATA ||
    ($rcdata_or_cdata && $this->escape === false))) {
        /* When the content model flag is set to the PCDATA state, or to the
        RCDATA/CDATA state with the escape flag false: switch to the tag
        open state. Otherwise: treat it as per the "anything else" entry. */
        $this->state = 'tagOpen';

    /* U+003E GREATER-THAN SIGN (>) */
    } elseif($char === '>') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are "-->", set
        the escape flag to false. */
        // The window ends on the current ">" and therefore starts two
        // positions back; looking forward from ">" could never match.
        if($rcdata_or_cdata && $this->escape === true &&
        $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->') {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif($model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => substr($this->data, $this->char)
        ));

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
        otherwise would also be treated as a character token and emit it
        as a single character token. Stay in the data state. */
        // This branch is also reached with "<" or "&" as the current
        // character (e.g. "&" in CDATA, or "<" while the escape flag is set).
        // strcspn() then reports a run of length 0, which used to move the
        // cursor backwards and re-read the same character forever, so at
        // least one character is always consumed.
        // Note: "-" and ">" inside such a run are not inspected individually.
        $len = max(1, strcspn($this->data, '<&', $this->char));
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        $this->state = 'data';
    }
}
/**
 * Entity data state: tries to consume an entity after "&" and emits either
 * the decoded character or a literal ampersand, then returns to the data state.
 */
private function entityDataState() {
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    // A plain truthiness test would also discard a decoded "0" (e.g. from
    // "&#48;"), so only explicit empty results count as "nothing returned".
    // NOTE(review): assumes entity() reports failure as false, null or '' -
    // confirm against its implementation.
    $nothing_returned = ($entity === false || $entity === null || $entity === '');
    $char = $nothing_returned ? '&' : $entity;

    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => $char
    ));

    // Finally, switch to the data state.
    $this->state = 'data';
}
/**
 * Tag open state: decides what a "<" starts, depending on the content model.
 * Content models other than RCDATA, CDATA and PCDATA are left untouched.
 */
private function tagOpenState() {
    switch($this->content_model) {
        case self::RCDATA:
        case self::CDATA:
            /* Only "</" is treated as markup here. If the next input
            character is a U+002F SOLIDUS (/), consume it and switch to the
            close tag open state; otherwise emit a U+003C LESS-THAN SIGN
            character token and go back to the data state. */
            $followed_by_slash = ($this->character($this->char + 1) === '/');

            if($followed_by_slash) {
                $this->char++;
                $this->state = 'closeTagOpen';
                return;
            }

            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => '<'
            ));
            $this->state = 'data';
            return;

        case self::PCDATA:
            // Consume the next input character:
            $this->char++;
            $next = $this->char();

            /* U+0021 EXCLAMATION MARK (!)
            Switch to the markup declaration open state. */
            if($next === '!') {
                $this->state = 'markupDeclarationOpen';
                return;
            }

            /* U+002F SOLIDUS (/)
            Switch to the close tag open state. */
            if($next === '/') {
                $this->state = 'closeTagOpen';
                return;
            }

            /* ASCII letter
            Create a new start tag token named after the lowercased
            character and switch to the tag name state. The token is not
            emitted yet; later states fill in further details. */
            if(preg_match('/^[A-Za-z]$/', $next)) {
                $this->token = array(
                    'name' => strtolower($next),
                    'type' => self::STARTTAG,
                    'attr' => array()
                );
                $this->state = 'tagName';
                return;
            }

            /* U+003E GREATER-THAN SIGN (>)
            Parse error. Emit "<" and ">" as character data and switch to
            the data state. */
            if($next === '>') {
                $this->emitToken(array(
                    'type' => self::CHARACTR,
                    'data' => '<>'
                ));
                $this->state = 'data';
                return;
            }

            /* U+003F QUESTION MARK (?)
            Parse error. Switch to the bogus comment state. */
            if($next === '?') {
                $this->state = 'bogusComment';
                return;
            }

            /* Anything else
            Parse error. Emit a U+003C LESS-THAN SIGN character token and
            reconsume the current input character in the data state. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => '<'
            ));
            $this->char--;
            $this->state = 'data';
            return;
    }
}
/**
 * Close tag open state: handles the input following "</".
 *
 * In the RCDATA and CDATA content models an end tag is only accepted when
 * its name equals the nodeName of the last entry on the tree constructor's
 * stack and is followed by whitespace, ">" or "/"; otherwise "</" is emitted
 * as text.
 */
private function closeTagOpenState() {
    $tag_name = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matches_open = count($this->tree->stack) > 0
        && $tag_name === end($this->tree->stack)->nodeName;

    $in_raw_text = ($this->content_model === self::RCDATA
        || $this->content_model === self::CDATA);

    /* In the RCDATA or CDATA states the next few characters must match the
    tag name of the last start tag (case insensitively) and be immediately
    followed by one of: tab, LF, VT, FF, space, ">" or "/". If not, this is
    a parse error. */
    $rejected = false;
    if($in_raw_text) {
        if(!$matches_open) {
            $rejected = true;
        } else {
            $follower = $this->character($this->char + 1 + strlen($tag_name));
            $rejected = !preg_match('/[\t\n\x0b\x0c >\/]/', $follower)
                || $this->EOF === $this->char;
        }
    }

    if($rejected) {
        /* Emit a U+003C LESS-THAN SIGN character token and a U+002F
        SOLIDUS character token, and switch to the data state to process
        the next input character. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->state = 'data';
        return;
    }

    /* Otherwise (PCDATA, or the name does match), consume the next input
    character: */
    $this->char++;
    $next = $this->char();

    if(preg_match('/^[A-Za-z]$/', $next)) {
        /* ASCII letter
        Create a new end tag token named after the lowercased character and
        switch to the tag name state. The token is not emitted yet. */
        $this->token = array(
            'name' => strtolower($next),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if($next === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Parse error. Switch to the data state. */
        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit "<" and "/" as character data. Reconsume the EOF
        character in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Parse error. Switch to the bogus comment state. */
    $this->state = 'bogusComment';
}
/**
 * Tag name state: extends the current tag token's name one character at a
 * time until whitespace, "/", ">" or the end of the input is reached.
 */
private function tagNameState() {
    // Consume the next input character:
    $this->char++;
    $current = $this->character($this->char);

    /* Tab, LF, VT, FF, space: switch to the before attribute name state.
    U+002F SOLIDUS (/): parse error unless this is a permitted slash; also
    switch to the before attribute name state. */
    if(preg_match('/^[\t\n\x0b\x0c ]$/', $current) || $current === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    /* U+003E GREATER-THAN SIGN (>)
    Emit the current tag token. Switch to the data state. */
    if($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    /* EOF
    Parse error. Emit the current tag token. Reconsume the EOF character in
    the data state. */
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Append the lowercased input character to the current tag token's tag
    name. Stay in the tag name state. */
    $this->token['name'] .= strtolower($current);
    $this->state = 'tagName';
}
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * and starts a new attribute on any other character.
 */
private function beforeAttributeNameState() {
    // Consume the next input character:
    $this->char++;
    $current = $this->character($this->char);

    /* Tab, LF, VT, FF, space: stay in the before attribute name state.
    U+002F SOLIDUS (/): parse error unless this is a permitted slash; stay
    in the before attribute name state as well. */
    if(preg_match('/^[\t\n\x0b\x0c ]$/', $current) || $current === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    /* U+003E GREATER-THAN SIGN (>)
    Emit the current tag token. Switch to the data state. */
    if($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    /* EOF
    Parse error. Emit the current tag token. Reconsume the EOF character in
    the data state. */
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Start a new attribute in the current tag token, named after the
    lowercased input character and with a null value. Switch to the
    attribute name state. */
    $attribute = array(
        'name' => strtolower($current),
        'value' => null
    );
    $this->token['attr'][] = $attribute;
    $this->state = 'attributeName';
}
/**
 * Attribute name state: extends the name of the most recently started
 * attribute until whitespace, "=", ">", "/" or the end of input is reached.
 */
private function attributeNameState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        // The slash always ends the name. Previously a "/" directly followed
        // by ">" fell through to "anything else" and was appended to the
        // attribute name (e.g. <br clear/> produced an attribute "clear/").
        $this->state = 'beforeAttributeName';

    } elseif($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's name.
        Stay in the attribute name state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['name'] .= strtolower($char);

        $this->state = 'attributeName';
    }
}
/**
 * After attribute name state: reached after whitespace following an
 * attribute name; decides whether a value, another attribute or the end of
 * the tag follows.
 */
private function afterAttributeNameState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the
        before attribute name state. */
        // The slash is never attribute content. Previously a "/" directly
        // followed by ">" fell through to "anything else" and started a new
        // attribute named "/" (e.g. for <br clear />).
        $this->state = 'beforeAttributeName';

    } elseif($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Set that attribute's
        name to the lowercased input character and its value to null.
        Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
/**
 * Before attribute value state: reached after "="; skips whitespace and
 * selects the quoted or unquoted attribute value state.
 */
private function beforeAttributeValueState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        // Without this branch the end of input was treated as "anything
        // else" and handed to the unquoted value state, which has no way to
        // terminate on its own.
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Attribute value (double-quoted) state: collects the value of the most
 * recently started attribute up to the closing quotation mark.
 */
private function attributeValueDoubleQuotedState() {
    // Consume the next input character:
    $this->char++;
    $current = $this->character($this->char);

    /* U+0022 QUOTATION MARK (")
    Switch to the before attribute name state. */
    if($current === '"') {
        $this->state = 'beforeAttributeName';
        return;
    }

    /* U+0026 AMPERSAND (&)
    Run the entity in attribute value handling. */
    if($current === '&') {
        $this->entityInAttributeValueState('double');
        return;
    }

    /* EOF
    Parse error. Emit the current tag token. Reconsume the character in the
    data state. */
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Append the current input character to the current attribute's value.
    Stay in the attribute value (double-quoted) state. */
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $current;
    $this->state = 'attributeValueDoubleQuoted';
}
/**
 * Attribute value (single-quoted) state: collects the value of the most
 * recently started attribute up to the closing apostrophe.
 */
private function attributeValueSingleQuotedState() {
    // Consume the next input character:
    $this->char++;
    $current = $this->character($this->char);

    /* U+0027 APOSTROPHE (')
    Switch to the before attribute name state. */
    if($current === '\'') {
        $this->state = 'beforeAttributeName';
        return;
    }

    /* U+0026 AMPERSAND (&)
    Run the entity in attribute value handling. */
    if($current === '&') {
        $this->entityInAttributeValueState('single');
        return;
    }

    /* EOF
    Parse error. Emit the current tag token. Reconsume the character in the
    data state. */
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Append the current input character to the current attribute's value.
    Stay in the attribute value (single-quoted) state. */
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $current;
    $this->state = 'attributeValueSingleQuoted';
}
/**
 * Attribute value (unquoted) state: collects the value of the most recently
 * started attribute until whitespace, ">" or the end of the input.
 */
private function attributeValueUnquotedState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif($char === '&') {
        /* U+0026 AMPERSAND (&)
        Run the entity in attribute value handling. */
        $this->entityInAttributeValueState();

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        // Without this branch an unterminated value such as "<a href=foo"
        // kept appending nothing and re-entering this state forever. ">=" is
        // used because the cursor may already be past the end on entry.
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Handles "&" inside an attribute value: tries to consume an entity and
 * appends either the decoded character or a literal ampersand to the value
 * of the most recently started attribute. The tokenizer state is left
 * unchanged, so the calling attribute value state continues afterwards.
 *
 * Callers in this class pass an extra quote-style argument ('double' /
 * 'single'); it is not declared here and is not used.
 */
private function entityInAttributeValueState() {
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, append a U+0026 AMPERSAND character to the
    // current attribute's value. Otherwise, append the returned character.
    // A plain truthiness test would also discard a decoded "0" (e.g. from
    // "&#48;"), so only explicit empty results count as "nothing returned".
    // NOTE(review): assumes entity() reports failure as false, null or '' -
    // confirm against its implementation.
    $nothing_returned = ($entity === false || $entity === null || $entity === '');
    $char = $nothing_returned ? '&' : $entity;

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
/**
 * Bogus comment state: turns everything from the current character up to
 * (not including) the next ">" - or up to the end of the input - into a
 * comment token, then returns to the data state.
 */
private function bogusCommentState() {
    /* Consume every character up to the first U+003E GREATER-THAN SIGN
    character (>) or the end of the file (EOF), whichever comes first. Emit
    a comment token whose data is the concatenation of all the characters
    starting from and including the character that caused the state machine
    to switch into the bogus comment state, up to and including the last
    consumed character before the U+003E character, if any, or up to the
    end of the file otherwise. (If the comment was started by the end of
    the file (EOF), the token is empty.) */
    // strcspn() yields an empty run when the current character is already
    // ">" (as in "<!>"). The regex-based characters() helper returned the
    // whole remaining input in that case, swallowing the rest of the
    // document into the comment.
    if($this->char < $this->EOF) {
        $length = strcspn($this->data, '>', $this->char);
        $data = substr($this->data, $this->char, $length);
    } else {
        $data = '';
    }

    $this->emitToken(array(
        'data' => $data,
        'type' => self::COMMENT
    ));

    // Leaves the cursor on the ">" (if any); the data state steps past it.
    $this->char += strlen($data);

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if($this->char === $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
/**
 * Markup declaration open state: reached after "<!"; recognises the start
 * of a comment or a DOCTYPE and treats anything else as a bogus comment.
 */
private function markupDeclarationOpenState() {
    $after_bang = $this->char + 1;

    /* If the next two characters are both U+002D HYPHEN-MINUS (-)
    characters, consume those two characters, create a comment token whose
    data is empty, and switch to the comment state. */
    if($this->character($after_bang, 2) === '--') {
        $this->char += 2;
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        $this->state = 'comment';
        return;
    }

    /* Otherwise, if the next seven characters are a case-insensitive match
    for the word "DOCTYPE", then consume those characters and switch to the
    DOCTYPE state. */
    if(strtolower($this->character($after_bang, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    /* Otherwise, it is a parse error. Switch to the bogus comment state.
    The next character that is consumed, if any, is the first character
    that will be in the comment. */
    $this->char++;
    $this->state = 'bogusComment';
}
/**
 * Comment state: appends characters to the current comment token until a
 * "-" or the end of the input is seen.
 */
private function commentState() {
    /* Consume the next input character: */
    $this->char++;
    $current = $this->char();

    /* U+002D HYPHEN-MINUS (-)
    Switch to the comment dash state. */
    if($current === '-') {
        $this->state = 'commentDash';
        return;
    }

    /* EOF
    Parse error. Emit the comment token. Reconsume the EOF character in the
    data state. */
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Append the input character to the comment token's data. Stay in the
    comment state. */
    $this->token['data'] .= $current;
}
/**
 * Comment dash state: reached after a single "-" inside a comment.
 */
private function commentDashState() {
    /* Consume the next input character: */
    $this->char++;
    $current = $this->char();

    /* U+002D HYPHEN-MINUS (-)
    Switch to the comment end state. */
    if($current === '-') {
        $this->state = 'commentEnd';
        return;
    }

    /* EOF
    Parse error. Emit the comment token. Reconsume the EOF character in the
    data state. */
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Append a U+002D HYPHEN-MINUS (-) character and the input character to
    the comment token's data. Switch to the comment state. */
    $this->token['data'] .= '-'.$current;
    $this->state = 'comment';
}
/**
 * Comment end state: reached after "--" inside a comment.
 */
private function commentEndState() {
    /* Consume the next input character: */
    $this->char++;
    $current = $this->char();

    // ">" closes the comment: emit it and return to the data state.
    if($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // A further "-" is kept as comment data; the state does not change.
    if($current === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // End of input: emit the comment and reconsume EOF in the data state.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: the "--" was ordinary comment text after all.
    $this->token['data'] .= '--'.$current;
    $this->state = 'comment';
}
/**
 * DOCTYPE state: reached after "<!DOCTYPE"; consumes one whitespace
 * character if present and always continues in the before DOCTYPE name state.
 */
private function doctypeState() {
    /* Consume the next input character: */
    $this->char++;
    $current = $this->char();

    // A non-whitespace character is put back so the next state reads it.
    if(!preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
/**
 * Before DOCTYPE name state: skips whitespace, then either starts a DOCTYPE
 * token with the first name character or emits a nameless DOCTYPE token
 * when ">" or the end of the input comes first. Every token created here
 * has its error flag set.
 */
private function beforeDoctypeNameState() {
    /* Consume the next input character: */
    $this->char++;
    $current = $this->char();

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        // Stay in the before DOCTYPE name state.
        return;
    }

    $is_lowercase_letter = (bool) preg_match('/^[a-z]$/', $current);
    $closes_doctype = !$is_lowercase_letter && $current === '>';
    $at_eof = !$is_lowercase_letter && !$closes_doctype
        && $this->char === $this->EOF;

    if($closes_doctype || $at_eof) {
        // No name was given: emit a nameless DOCTYPE token.
        $this->emitToken(array(
            'name' => null,
            'type' => self::DOCTYPE,
            'error' => true
        ));

        if($at_eof) {
            // Reconsume the EOF character in the data state.
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // First character of the name; lowercase ASCII letters are uppercased,
    // any other character is stored unchanged.
    $this->token = array(
        'name' => $is_lowercase_letter ? strtoupper($current) : $current,
        'type' => self::DOCTYPE,
        'error' => true
    );
    $this->state = 'doctypeName';
}
/**
 * DOCTYPE name state: extends the DOCTYPE token's name and, on every call,
 * recomputes the error flag so that it is false only while the name is
 * exactly "HTML".
 */
private function doctypeNameState() {
    /* Consume the next input character: */
    $this->char++;
    $current = $this->char();

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        // Whitespace ends the name. The state string is capitalised here;
        // it still dispatches because PHP method names are case-insensitive.
        $this->state = 'AfterDoctypeName';

    } else if($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';

    } else if(preg_match('/^[a-z]$/', $current)) {
        // Lowercase ASCII letters are stored uppercased.
        $this->token['name'] .= strtoupper($current);

    } else if($this->char === $this->EOF) {
        // Emit what we have and reconsume EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        $this->token['name'] .= $current;
    }

    // Runs after every branch, including those that already emitted the token.
    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
/**
 * After DOCTYPE name state.
 *
 * Consumes one input character. Whitespace is skipped; ">" or end of input
 * emits the current DOCTYPE token and returns to the data state; any other
 * character marks the token as erroneous and enters the bogus DOCTYPE state.
 */
private function afterDoctypeNameState() {
    $this->char++;
    $next = $this->char();

    // Whitespace: stay in this state.
    if(preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    $closed = ($next === '>');

    if($closed || $this->char === $this->EOF) {
        $this->emitToken($this->token);

        if(!$closed) {
            // End of input: step back before returning to the data state.
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
/**
 * Bogus DOCTYPE state.
 *
 * Consumes one input character and discards it unless it is ">" or the end
 * of input, in which case the current DOCTYPE token is emitted and the
 * tokeniser returns to the data state.
 */
private function bogusDoctypeState() {
    $this->char++;
    $closed = ($this->char() === '>');

    if(!$closed && $this->char !== $this->EOF) {
        // Stay in the bogus DOCTYPE state.
        return;
    }

    $this->emitToken($this->token);

    if(!$closed) {
        // End of input: step back before returning to the data state.
        $this->char--;
    }

    $this->state = 'data';
}
/**
 * Tries to consume an entity. Used when parsing entities in text and in
 * attributes.
 *
 * On entry $this->char is the position of the U+0026 AMPERSAND, so the
 * character at $this->char + 1 decides between a numeric reference ("#")
 * and a named one.
 *
 * @return string|false The decoded character(s), or false when no entity
 *                      could be matched (in which case $this->char is
 *                      restored to the position it had on entry).
 */
private function entity() {
    $start = $this->char;

    // The behaviour depends on the identity of the next character (the
    // one immediately after the U+0026 AMPERSAND character):
    switch($this->character($this->char + 1)) {
        // U+0023 NUMBER SIGN (#)
        case '#':

            // The behaviour further depends on the character after the
            // U+0023 NUMBER SIGN. $this->char still points at the
            // ampersand here, so that character sits at offset + 2
            // (offset + 1 is the "#" itself and could never match "x").
            switch($this->character($this->char + 2)) {
                // U+0078 LATIN SMALL LETTER X
                // U+0058 LATIN CAPITAL LETTER X
                case 'x':
                case 'X':
                    // Hexadecimal reference: digits are 0-9, A-F, a-f and
                    // start one position later, after the "x".
                    $char = 1;
                    $char_class = '0-9A-Fa-f';
                break;

                // Anything else
                default:
                    // Decimal reference: digits are 0-9 only.
                    $char = 0;
                    $char_class = '0-9';
                break;
            }

            // Consume as many characters as match the range of characters
            // given above.
            $this->char++;
            $e_name = $this->characters($char_class, $this->char + $char + 1);
            // NOTE(review): character() and characters() are defined outside
            // this block. $entity is built from $start with the current
            // position as second argument - confirm this matches the
            // helper's contract for numeric references.
            $entity = $this->character($start, $this->char);
            $cond = strlen($e_name) > 0;

            // The rest of the parsing happens below.
        break;

        // Anything else
        default:
            // Consume the maximum number of characters possible, with the
            // consumed characters case-sensitively matching one of the
            // identifiers in the first column of the entities table.
            $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
            $len = strlen($e_name);

            // Try ever longer prefixes; the first one found in the
            // entities table wins.
            for($c = 1; $c <= $len; $c++) {
                $id = substr($e_name, 0, $c);
                $this->char++;

                if(in_array($id, $this->entities)) {
                    if ($e_name[$c-1] !== ';') {
                        if ($c < $len && $e_name[$c] == ';') {
                            $this->char++; // consume extra semicolon
                        }
                    }
                    $entity = $id;
                    break;
                }
            }

            $cond = isset($entity);
            // The rest of the parsing happens below.
        break;
    }

    if(!$cond) {
        // If no match can be made, then this is a parse error. No
        // characters are consumed, and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Return a character token for the character corresponding to the
    // entity name (as given by the second column of the entities table).
    return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
}
/**
 * Forwards a token to the tree constructor and updates the tokeniser's
 * content model from the outcome.
 *
 * An integer returned by the tree constructor becomes the new content
 * model; otherwise an end tag token resets the content model to PCDATA.
 *
 * @param array $token Token to emit; its 'type' entry is read.
 */
private function emitToken($token) {
    $result = $this->tree->emitToken($token);

    if(is_int($result)) {
        $this->content_model = $result;
        return;
    }

    if($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
/**
 * Ends tokenisation: clears the current state and sends an end-of-file
 * token to the tree constructor.
 */
private function EOF() {
    $this->state = null;

    $eof = array(
        'type' => self::EOF
    );
    $this->tree->emitToken($eof);
}
1169 class HTML5TreeConstructer {
// Stack of open elements; the handlers of this class push DOM nodes onto it
// and pop them off again.
1170 public $stack = array();
// Current tree-construction phase (one of the *_PHASE constants below).
1172 private $phase;
// Current insertion mode for the main phase (one of the mode constants below).
1173 private $mode;
// The DOMDocument being built.
1174 private $dom;
// NOTE(review): not referenced in the part of the class visible here.
1175 private $foster_parent = null;
// List of active formatting elements; holds element nodes and self::MARKER entries.
1176 private $a_formatting = array();
// Head element pointer and form element pointer; null until the
// corresponding element has been inserted.
1178 private $head_pointer = null;
1179 private $form_pointer = null;
// Tag-name lists per element category.
// NOTE(review): presumably consulted by getElementCategory() / elementInScope(),
// which are defined outside this excerpt - confirm.
1181 private $scoping = array('button','caption','html','marquee','object','table','td','th');
1182 private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
1183 private $special = array('address','area','base','basefont','bgsound',
1184 'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
1185 'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
1186 'h6','head','hr','iframe','image','img','input','isindex','li','link',
1187 'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
1188 'option','p','param','plaintext','pre','script','select','spacer','style',
1189 'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
1191 // The different phases.
// Values of $this->phase. Note that END_PHASE (3) has the same integer value
// as the IN_BODY insertion mode below, so the two must never be mixed in one
// switch.
1192 const INIT_PHASE = 0;
1193 const ROOT_PHASE = 1;
1194 const MAIN_PHASE = 2;
1195 const END_PHASE = 3;
1197 // The different insertion modes for the main phase.
// Values of $this->mode; mainPhase() dispatches on them.
1198 const BEFOR_HEAD = 0;
1199 const IN_HEAD = 1;
1200 const AFTER_HEAD = 2;
1201 const IN_BODY = 3;
1202 const IN_TABLE = 4;
1203 const IN_CAPTION = 5;
1204 const IN_CGROUP = 6;
1205 const IN_TBODY = 7;
1206 const IN_ROW = 8;
1207 const IN_CELL = 9;
1208 const IN_SELECT = 10;
1209 const AFTER_BODY = 11;
1210 const IN_FRAME = 12;
1211 const AFTR_FRAME = 13;
1213 // The different types of elements.
// Compared against the result of getElementCategory().
1214 const SPECIAL = 0;
1215 const SCOPING = 1;
1216 const FORMATTING = 2;
1217 const PHRASING = 3;
// Sentinel stored in the list of active formatting elements.
1219 const MARKER = 0;
/**
 * Starts tree construction in the initial phase with the "before head"
 * insertion mode and creates the empty UTF-8 DOMDocument that the tree
 * will be built into.
 */
public function __construct() {
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;

    $document = new DOMDocument;
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
}
/**
 * Entry point for tokens coming from the tokeniser: routes the token to
 * the handler of the current tree-construction phase.
 *
 * @param array $token Token to process.
 * @return mixed Whatever the phase handler returns; null when the current
 *               phase matches none of the known phases.
 */
public function emitToken($token) {
    $phase = $this->phase;

    if($phase == self::INIT_PHASE) {
        return $this->initPhase($token);
    }

    if($phase == self::ROOT_PHASE) {
        return $this->rootElementPhase($token);
    }

    if($phase == self::MAIN_PHASE) {
        return $this->mainPhase($token);
    }

    if($phase == self::END_PHASE) {
        return $this->trailingEndPhase($token);
    }
}
/**
 * Initial phase of tree construction.
 *
 * - A DOCTYPE token marked as being in error, a comment, start tag, end tag
 *   or end-of-file token, or a character token that is not pure whitespace,
 *   switches to the root element phase and is reprocessed there.
 * - A DOCTYPE token marked as correct switches to the root element phase.
 * - A pure-whitespace character token is appended to the document.
 *
 * @param array $token Token to process.
 * @return mixed Result of rootElementPhase() when the token is reprocessed,
 *               otherwise null.
 */
private function initPhase($token) {
    $blank = '/^[\t\n\x0b\x0c ]+$/';
    $hasErrorFlag = isset($token['error']);

    // Decide whether the token forces us out of the initial phase.
    $leaveNow = $hasErrorFlag && $token['error'];

    if(!$leaveNow) {
        $leaveNow = in_array(
            $token['type'],
            array(HTML5::COMMENT, HTML5::STARTTAG, HTML5::ENDTAG, HTML5::EOF),
            true
        ) || (
            $token['type'] === HTML5::CHARACTR &&
            isset($token['data']) &&
            !preg_match($blank, $token['data'])
        );
    }

    if($leaveNow) {
        /* This specification does not define how to handle this case. In
        particular, user agents may ignore the entirety of this specification
        altogether for such documents, and instead invoke special parse modes
        with a greater emphasis on backwards compatibility. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct: an error flag is present
    here but, having passed the check above, it is not set. */
    if($hasErrorFlag) {
        // NOTE(review): the node is created but never attached to the
        // document in this method.
        $doctype = new DOMDocumentType(null, null, 'HTML');

        /* Then, switch to the root element phase of the tree construction
        stage. */
        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* A character token consisting only of whitespace: append it to the
    Document node. */
    if(isset($token['data']) && preg_match($blank, $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
/**
 * Root element phase of tree construction.
 *
 * DOCTYPE tokens are ignored, comments and pure-whitespace text are
 * appended to the document, and any other character, start tag, end tag or
 * end-of-file token creates the root html element, switches to the main
 * phase and is reprocessed there.
 *
 * @param array $token Token to process.
 * @return mixed Result of mainPhase() when the token is reprocessed,
 *               otherwise null.
 */
private function rootElementPhase($token) {
    $type = $token['type'];

    /* A DOCTYPE token */
    if($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.
        return;
    }

    /* A comment token: append a Comment node to the Document object. */
    if($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);
        return;
    }

    $isText = ($type === HTML5::CHARACTR);

    /* A character token made up of whitespace only: append it to the
    Document node. */
    if($isText && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
        return;
    }

    /* Any other character token, a start tag, an end tag or end-of-file */
    if($isText ||
    $type === HTML5::STARTTAG ||
    $type === HTML5::ENDTAG ||
    $type === HTML5::EOF) {
        /* Create the html element, append it to the Document object and
        make it the first entry of the stack of open elements. */
        $html = $this->dom->createElement('html');
        $this->dom->appendChild($html);
        $this->stack[] = $html;

        /* Switch to the main phase and reprocess the current token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);
    }
}
/**
 * Main phase of tree construction.
 *
 * DOCTYPE tokens are ignored, an "html" start tag only contributes missing
 * attributes to the root element, end-of-file generates implied end tags,
 * and every other token is handed to the handler of the current insertion
 * mode.
 *
 * @param array $token Token to process; 'type' is always read, 'name' and
 *                     'attr' are read for start tag tokens.
 * @return mixed Whatever the insertion-mode handler returns, otherwise null.
 */
private function mainPhase($token) {
    /* Tokens in the main phase must be handled as follows: */

    /* A DOCTYPE token */
    if($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A start tag token with the tag name "html" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        /* If this start tag token was not the first start tag token, then
        it is a parse error. */

        /* For each attribute on the token, check to see if the attribute
        is already present on the top element of the stack of open elements.
        If it is not, add the attribute and its corresponding value to that
        element. */
        foreach($token['attr'] as $attr) {
            if(!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }

    /* An end-of-file token */
    } elseif($token['type'] === HTML5::EOF) {
        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

    /* Anything else. */
    } else {
        /* Depends on the insertion mode. There is deliberately no case for
        self::END_PHASE here: it is a phase, not an insertion mode, and its
        value (3) equals self::IN_BODY, so such a case could never be
        reached. */
        switch($this->mode) {
            case self::BEFOR_HEAD: return $this->beforeHead($token);
            case self::IN_HEAD:    return $this->inHead($token);
            case self::AFTER_HEAD: return $this->afterHead($token);
            case self::IN_BODY:    return $this->inBody($token);
            case self::IN_TABLE:   return $this->inTable($token);
            case self::IN_CAPTION: return $this->inCaption($token);
            case self::IN_CGROUP:  return $this->inColumnGroup($token);
            case self::IN_TBODY:   return $this->inTableBody($token);
            case self::IN_ROW:     return $this->inRow($token);
            case self::IN_CELL:    return $this->inCell($token);
            case self::IN_SELECT:  return $this->inSelect($token);
            case self::AFTER_BODY: return $this->afterBody($token);
            case self::IN_FRAME:   return $this->inFrameset($token);
            case self::AFTR_FRAME: return $this->afterFrameset($token);
        }
    }
}
/**
 * Handles a token while the insertion mode is "before head".
 *
 * Whitespace text and comments are inserted as they are; a "head" start tag
 * creates the head element and switches to "in head"; any other start tag,
 * an "html" end tag or non-whitespace text first acts as if a "head" start
 * tag had been seen and is then reprocessed by inHead(). Other end tags are
 * ignored.
 *
 * @param array $token Token to process.
 * @return mixed Result of inHead() when the token is reprocessed,
 *               otherwise null.
 */
private function beforeHead($token) {
    $type = $token['type'];

    /* A character token made up of whitespace only: append it to the
    current node. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node. */
    if($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    $isStart = ($type === HTML5::STARTTAG);

    /* A start tag token with the tag name "head" */
    if($isStart && $token['name'] === 'head') {
        /* Create the element, remember it as the head element pointer and
        change the insertion mode to "in head". */
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;
        return;
    }

    /* Any other start tag, an end tag with the tag name "html", or a
    character token that is not whitespace. */
    $impliesHead = $isStart ||
        ($type === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($type === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/',
        $token['data']));

    if($impliesHead) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $this->beforeHead(array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inHead($token);
    }

    /* Any other end tag: parse error. Ignore the token. */
}
/**
 * Handles a token while the insertion mode is "in head".
 *
 * The head element pointer may be null (the existing title/style/base
 * branches call this the innerHTML case); every branch that uses the
 * pointer therefore checks it first and falls back to the current node.
 *
 * @param array $token Token to process; 'type' is always read, 'name' for
 *                     tag tokens and 'data' for character/comment tokens.
 * @return mixed HTML5::PCDATA, HTML5::RCDATA or HTML5::CDATA when the
 *               tokeniser's content model has to change, the result of
 *               afterHead() when the token is reprocessed there, otherwise
 *               null.
 */
private function inHead($token) {
    /* Handle the token as follows: */

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if(($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
    $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
    array('title', 'style', 'script')))) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag closing title, style or script: leave the element and
    switch the tokeniser back to PCDATA. */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('title', 'style', 'script'))) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag with the tag name "title" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $element = $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the RCDATA state. */
        return HTML5::RCDATA;

    /* A start tag with the tag name "style" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "script" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
        /* Create an element for the token. Like title and style, fall back
        to the current node when there is no head element pointer instead
        of calling a method on null. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('base', 'link', 'meta'))) {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* These elements never get an end tag, so the node must not stay on
        the stack of open elements in either case. */
        array_pop($this->stack);

    /* An end tag with the tag name "head" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is a head element, pop the current node off
        the stack of open elements. Otherwise (including when there is no
        head element pointer at all) this is a parse error. */
        if($this->head_pointer !== null &&
        $this->head_pointer->isSameNode(end($this->stack))) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is a head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if($this->head_pointer !== null &&
        $this->head_pointer->isSameNode(end($this->stack))) {
            $this->inHead(array(
                'name' => 'head',
                'type' => HTML5::ENDTAG
            ));

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
/**
 * Handles a token while the insertion mode is "after head".
 *
 * Whitespace text and comments are inserted as they are; "body" and
 * "frameset" start tags create their element and switch the insertion mode;
 * head-only start tags are sent back to inHead(); everything else first
 * acts as if a "body" start tag had been seen and is then reprocessed by
 * inBody().
 *
 * @param array $token Token to process.
 * @return mixed Result of inHead() or inBody() when the token is
 *               reprocessed, otherwise null.
 */
private function afterHead($token) {
    $type = $token['type'];

    /* A character token made up of whitespace only: append it to the
    current node. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node. */
    if($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    if($type === HTML5::STARTTAG) {
        /* A start tag token with the tag name "body" */
        if($token['name'] === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;
            return;
        }

        /* A start tag token with the tag name "frameset" */
        if($token['name'] === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;
            return;
        }

        /* A start tag token whose tag name is one of: "base", "link",
        "meta", "script", "style", "title" */
        $headOnly = array('base', 'link', 'meta', 'script', 'style', 'title');

        if(in_array($token['name'], $headOnly)) {
            /* Parse error. Switch the insertion mode back to "in head" and
            reprocess the token. */
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a start tag token with the tag name "body"
    and no attributes had been seen, and then reprocess the current token. */
    $this->afterHead(array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    ));

    return $this->inBody($token);
}
1623 private function inBody($token) {
1624 /* Handle the token as follows: */
1626 switch($token['type']) {
1627 /* A character token */
1628 case HTML5::CHARACTR:
1629 /* Reconstruct the active formatting elements, if any. */
1630 $this->reconstructActiveFormattingElements();
1632 /* Append the token's character to the current node. */
1633 $this->insertText($token['data']);
1634 break;
1636 /* A comment token */
1637 case HTML5::COMMENT:
1638 /* Append a Comment node to the current node with the data
1639 attribute set to the data given in the comment token. */
1640 $this->insertComment($token['data']);
1641 break;
1643 case HTML5::STARTTAG:
1644 switch($token['name']) {
1645 /* A start tag token whose tag name is one of: "script",
1646 "style" */
1647 case 'script': case 'style':
1648 /* Process the token as if the insertion mode had been "in
1649 head". */
1650 return $this->inHead($token);
1651 break;
1653 /* A start tag token whose tag name is one of: "base", "link",
1654 "meta", "title" */
1655 case 'base': case 'link': case 'meta': case 'title':
1656 /* Parse error. Process the token as if the insertion mode
1657 had been "in head". */
1658 return $this->inHead($token);
1659 break;
1661 /* A start tag token with the tag name "body" */
1662 case 'body':
1663 /* Parse error. If the second element on the stack of open
1664 elements is not a body element, or, if the stack of open
1665 elements has only one node on it, then ignore the token.
1666 (innerHTML case) */
1667 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1668 // Ignore
1670 /* Otherwise, for each attribute on the token, check to see
1671 if the attribute is already present on the body element (the
1672 second element) on the stack of open elements. If it is not,
1673 add the attribute and its corresponding value to that
1674 element. */
1675 } else {
1676 foreach($token['attr'] as $attr) {
1677 if(!$this->stack[1]->hasAttribute($attr['name'])) {
1678 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1682 break;
1684 /* A start tag whose tag name is one of: "address",
1685 "blockquote", "center", "dir", "div", "dl", "fieldset",
1686 "listing", "menu", "ol", "p", "ul" */
1687 case 'address': case 'blockquote': case 'center': case 'dir':
1688 case 'div': case 'dl': case 'fieldset': case 'listing':
1689 case 'menu': case 'ol': case 'p': case 'ul':
1690 /* If the stack of open elements has a p element in scope,
1691 then act as if an end tag with the tag name p had been
1692 seen. */
1693 if($this->elementInScope('p')) {
1694 $this->emitToken(array(
1695 'name' => 'p',
1696 'type' => HTML5::ENDTAG
1700 /* Insert an HTML element for the token. */
1701 $this->insertElement($token);
1702 break;
1704 /* A start tag whose tag name is "form" */
1705 case 'form':
1706 /* If the form element pointer is not null, ignore the
1707 token with a parse error. */
1708 if($this->form_pointer !== null) {
1709 // Ignore.
1711 /* Otherwise: */
1712 } else {
1713 /* If the stack of open elements has a p element in
1714 scope, then act as if an end tag with the tag name p
1715 had been seen. */
1716 if($this->elementInScope('p')) {
1717 $this->emitToken(array(
1718 'name' => 'p',
1719 'type' => HTML5::ENDTAG
1723 /* Insert an HTML element for the token, and set the
1724 form element pointer to point to the element created. */
1725 $element = $this->insertElement($token);
1726 $this->form_pointer = $element;
1728 break;
1730 /* A start tag whose tag name is "li", "dd" or "dt" */
1731 case 'li': case 'dd': case 'dt':
1732 /* If the stack of open elements has a p element in scope,
1733 then act as if an end tag with the tag name p had been
1734 seen. */
1735 if($this->elementInScope('p')) {
1736 $this->emitToken(array(
1737 'name' => 'p',
1738 'type' => HTML5::ENDTAG
1742 $stack_length = count($this->stack) - 1;
1744 for($n = $stack_length; 0 <= $n; $n--) {
1745 /* 1. Initialise node to be the current node (the
1746 bottommost node of the stack). */
1747 $stop = false;
1748 $node = $this->stack[$n];
1749 $cat = $this->getElementCategory($node->tagName);
1751 /* 2. If node is an li, dd or dt element, then pop all
1752 the nodes from the current node up to node, including
1753 node, then stop this algorithm. */
1754 if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1755 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1756 for($x = $stack_length; $x >= $n ; $x--) {
1757 array_pop($this->stack);
1760 break;
1763 /* 3. If node is not in the formatting category, and is
1764 not in the phrasing category, and is not an address or
1765 div element, then stop this algorithm. */
1766 if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1767 $node->tagName !== 'address' && $node->tagName !== 'div') {
1768 break;
1772 /* Finally, insert an HTML element with the same tag
1773 name as the token's. */
1774 $this->insertElement($token);
1775 break;
1777 /* A start tag token whose tag name is "plaintext" */
1778 case 'plaintext':
1779 /* If the stack of open elements has a p element in scope,
1780 then act as if an end tag with the tag name p had been
1781 seen. */
1782 if($this->elementInScope('p')) {
1783 $this->emitToken(array(
1784 'name' => 'p',
1785 'type' => HTML5::ENDTAG
1789 /* Insert an HTML element for the token. */
1790 $this->insertElement($token);
1792 return HTML5::PLAINTEXT;
1793 break;
1795 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1796 "h5", "h6" */
1797 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1798 /* If the stack of open elements has a p element in scope,
1799 then act as if an end tag with the tag name p had been seen. */
1800 if($this->elementInScope('p')) {
1801 $this->emitToken(array(
1802 'name' => 'p',
1803 'type' => HTML5::ENDTAG
1807 /* If the stack of open elements has in scope an element whose
1808 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1809 this is a parse error; pop elements from the stack until an
1810 element with one of those tag names has been popped from the
1811 stack. */
1812 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1813 array_pop($this->stack);
1816 /* Insert an HTML element for the token. */
1817 $this->insertElement($token);
1818 break;
1820 /* A start tag whose tag name is "a" */
1821 case 'a':
1822 /* If the list of active formatting elements contains
1823 an element whose tag name is "a" between the end of the
1824 list and the last marker on the list (or the start of
1825 the list if there is no marker on the list), then this
1826 is a parse error; act as if an end tag with the tag name
1827 "a" had been seen, then remove that element from the list
1828 of active formatting elements and the stack of open
1829 elements if the end tag didn't already remove it (it
1830 might not have if the element is not in table scope). */
1831 $leng = count($this->a_formatting);
1833 for($n = $leng - 1; $n >= 0; $n--) {
1834 if($this->a_formatting[$n] === self::MARKER) {
1835 break;
1837 } elseif($this->a_formatting[$n]->nodeName === 'a') {
1838 $this->emitToken(array(
1839 'name' => 'a',
1840 'type' => HTML5::ENDTAG
1842 break;
1846 /* Reconstruct the active formatting elements, if any. */
1847 $this->reconstructActiveFormattingElements();
1849 /* Insert an HTML element for the token. */
1850 $el = $this->insertElement($token);
1852 /* Add that element to the list of active formatting
1853 elements. */
1854 $this->a_formatting[] = $el;
1855 break;
1857 /* A start tag whose tag name is one of: "b", "big", "em", "font",
1858 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1859 case 'b': case 'big': case 'em': case 'font': case 'i':
1860 case 'nobr': case 's': case 'small': case 'strike':
1861 case 'strong': case 'tt': case 'u':
1862 /* Reconstruct the active formatting elements, if any. */
1863 $this->reconstructActiveFormattingElements();
1865 /* Insert an HTML element for the token. */
1866 $el = $this->insertElement($token);
1868 /* Add that element to the list of active formatting
1869 elements. */
1870 $this->a_formatting[] = $el;
1871 break;
1873 /* A start tag token whose tag name is "button" */
1874 case 'button':
1875 /* If the stack of open elements has a button element in scope,
1876 then this is a parse error; act as if an end tag with the tag
1877 name "button" had been seen, then reprocess the token. (We don't
1878 do that. Unnecessary.) */
1879 if($this->elementInScope('button')) {
1880 $this->inBody(array(
1881 'name' => 'button',
1882 'type' => HTML5::ENDTAG
1886 /* Reconstruct the active formatting elements, if any. */
1887 $this->reconstructActiveFormattingElements();
1889 /* Insert an HTML element for the token. */
1890 $this->insertElement($token);
1892 /* Insert a marker at the end of the list of active
1893 formatting elements. */
1894 $this->a_formatting[] = self::MARKER;
1895 break;
1897 /* A start tag token whose tag name is one of: "marquee", "object" */
1898 case 'marquee': case 'object':
1899 /* Reconstruct the active formatting elements, if any. */
1900 $this->reconstructActiveFormattingElements();
1902 /* Insert an HTML element for the token. */
1903 $this->insertElement($token);
1905 /* Insert a marker at the end of the list of active
1906 formatting elements. */
1907 $this->a_formatting[] = self::MARKER;
1908 break;
1910 /* A start tag token whose tag name is "xmp" */
1911 case 'xmp':
1912 /* Reconstruct the active formatting elements, if any. */
1913 $this->reconstructActiveFormattingElements();
1915 /* Insert an HTML element for the token. */
1916 $this->insertElement($token);
1918 /* Switch the content model flag to the CDATA state. */
1919 return HTML5::CDATA;
1920 break;
1922 /* A start tag whose tag name is "table" */
1923 case 'table':
1924 /* If the stack of open elements has a p element in scope,
1925 then act as if an end tag with the tag name p had been seen. */
1926 if($this->elementInScope('p')) {
1927 $this->emitToken(array(
1928 'name' => 'p',
1929 'type' => HTML5::ENDTAG
1933 /* Insert an HTML element for the token. */
1934 $this->insertElement($token);
1936 /* Change the insertion mode to "in table". */
1937 $this->mode = self::IN_TABLE;
1938 break;
1940 /* A start tag whose tag name is one of: "area", "basefont",
1941 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1942 case 'area': case 'basefont': case 'bgsound': case 'br':
1943 case 'embed': case 'img': case 'param': case 'spacer':
1944 case 'wbr':
1945 /* Reconstruct the active formatting elements, if any. */
1946 $this->reconstructActiveFormattingElements();
1948 /* Insert an HTML element for the token. */
1949 $this->insertElement($token);
1951 /* Immediately pop the current node off the stack of open elements. */
1952 array_pop($this->stack);
1953 break;
1955 /* A start tag whose tag name is "hr" */
1956 case 'hr':
1957 /* If the stack of open elements has a p element in scope,
1958 then act as if an end tag with the tag name p had been seen. */
1959 if($this->elementInScope('p')) {
1960 $this->emitToken(array(
1961 'name' => 'p',
1962 'type' => HTML5::ENDTAG
1966 /* Insert an HTML element for the token. */
1967 $this->insertElement($token);
1969 /* Immediately pop the current node off the stack of open elements. */
1970 array_pop($this->stack);
1971 break;
1973 /* A start tag whose tag name is "image" */
1974 case 'image':
1975 /* Parse error. Change the token's tag name to "img" and
1976 reprocess it. (Don't ask.) */
1977 $token['name'] = 'img';
1978 return $this->inBody($token);
1979 break;
1981 /* A start tag whose tag name is "input" */
1982 case 'input':
1983 /* Reconstruct the active formatting elements, if any. */
1984 $this->reconstructActiveFormattingElements();
1986 /* Insert an input element for the token. */
1987 $element = $this->insertElement($token, false);
1989 /* If the form element pointer is not null, then associate the
1990 input element with the form element pointed to by the form
1991 element pointer. */
1992 $this->form_pointer !== null
1993 ? $this->form_pointer->appendChild($element)
1994 : end($this->stack)->appendChild($element);
1996 /* Pop that input element off the stack of open elements. */
1997 array_pop($this->stack);
1998 break;
2000 /* A start tag whose tag name is "isindex" */
2001 case 'isindex':
2002 /* Parse error. */
2003 // w/e
2005 /* If the form element pointer is not null,
2006 then ignore the token. */
2007 if($this->form_pointer === null) {
2008 /* Act as if a start tag token with the tag name "form" had
2009 been seen. */
2010 $this->inBody(array(
2011 'name' => 'body',
2012 'type' => HTML5::STARTTAG,
2013 'attr' => array()
2016 /* Act as if a start tag token with the tag name "hr" had
2017 been seen. */
2018 $this->inBody(array(
2019 'name' => 'hr',
2020 'type' => HTML5::STARTTAG,
2021 'attr' => array()
2024 /* Act as if a start tag token with the tag name "p" had
2025 been seen. */
2026 $this->inBody(array(
2027 'name' => 'p',
2028 'type' => HTML5::STARTTAG,
2029 'attr' => array()
2032 /* Act as if a start tag token with the tag name "label"
2033 had been seen. */
2034 $this->inBody(array(
2035 'name' => 'label',
2036 'type' => HTML5::STARTTAG,
2037 'attr' => array()
2040 /* Act as if a stream of character tokens had been seen. */
2041 $this->insertText('This is a searchable index. '.
2042 'Insert your search keywords here: ');
2044 /* Act as if a start tag token with the tag name "input"
2045 had been seen, with all the attributes from the "isindex"
2046 token, except with the "name" attribute set to the value
2047 "isindex" (ignoring any explicit "name" attribute). */
2048 $attr = $token['attr'];
2049 $attr[] = array('name' => 'name', 'value' => 'isindex');
2051 $this->inBody(array(
2052 'name' => 'input',
2053 'type' => HTML5::STARTTAG,
2054 'attr' => $attr
2057 /* Act as if a stream of character tokens had been seen
2058 (see below for what they should say). */
2059 $this->insertText('This is a searchable index. '.
2060 'Insert your search keywords here: ');
2062 /* Act as if an end tag token with the tag name "label"
2063 had been seen. */
2064 $this->inBody(array(
2065 'name' => 'label',
2066 'type' => HTML5::ENDTAG
2069 /* Act as if an end tag token with the tag name "p" had
2070 been seen. */
2071 $this->inBody(array(
2072 'name' => 'p',
2073 'type' => HTML5::ENDTAG
2076 /* Act as if a start tag token with the tag name "hr" had
2077 been seen. */
2078 $this->inBody(array(
2079 'name' => 'hr',
2080 'type' => HTML5::ENDTAG
2083 /* Act as if an end tag token with the tag name "form" had
2084 been seen. */
2085 $this->inBody(array(
2086 'name' => 'form',
2087 'type' => HTML5::ENDTAG
2090 break;
2092 /* A start tag whose tag name is "textarea" */
2093 case 'textarea':
2094 $this->insertElement($token);
2096 /* Switch the tokeniser's content model flag to the
2097 RCDATA state. */
2098 return HTML5::RCDATA;
2099 break;
2101 /* A start tag whose tag name is one of: "iframe", "noembed",
2102 "noframes" */
2103 case 'iframe': case 'noembed': case 'noframes':
2104 $this->insertElement($token);
2106 /* Switch the tokeniser's content model flag to the CDATA state. */
2107 return HTML5::CDATA;
2108 break;
2110 /* A start tag whose tag name is "select" */
2111 case 'select':
2112 /* Reconstruct the active formatting elements, if any. */
2113 $this->reconstructActiveFormattingElements();
2115 /* Insert an HTML element for the token. */
2116 $this->insertElement($token);
2118 /* Change the insertion mode to "in select". */
2119 $this->mode = self::IN_SELECT;
2120 break;
2122 /* A start or end tag whose tag name is one of: "caption", "col",
2123 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2124 "tbody", "td", "tfoot", "th", "thead", "tr". */
2125 case 'caption': case 'col': case 'colgroup': case 'frame':
2126 case 'frameset': case 'head': case 'option': case 'optgroup':
2127 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2128 case 'tr':
2129 // Parse error. Ignore the token.
2130 break;
2132 /* A start or end tag whose tag name is one of: "event-source",
2133 "section", "nav", "article", "aside", "header", "footer",
2134 "datagrid", "command" */
2135 case 'event-source': case 'section': case 'nav': case 'article':
2136 case 'aside': case 'header': case 'footer': case 'datagrid':
2137 case 'command':
2138 // Work in progress!
2139 break;
2141 /* A start tag token not covered by the previous entries */
2142 default:
2143 /* Reconstruct the active formatting elements, if any. */
2144 $this->reconstructActiveFormattingElements();
2146 $this->insertElement($token, true, true);
2147 break;
2149 break;
2151 case HTML5::ENDTAG:
2152 switch($token['name']) {
2153 /* An end tag with the tag name "body" */
2154 case 'body':
2155 /* If the second element in the stack of open elements is
2156 not a body element, this is a parse error. Ignore the token.
2157 (innerHTML case) */
2158 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2159 // Ignore.
2161 /* If the current node is not the body element, then this
2162 is a parse error. */
2163 } elseif(end($this->stack)->nodeName !== 'body') {
2164 // Parse error.
2167 /* Change the insertion mode to "after body". */
2168 $this->mode = self::AFTER_BODY;
2169 break;
2171 /* An end tag with the tag name "html" */
2172 case 'html':
2173 /* Act as if an end tag with tag name "body" had been seen,
2174 then, if that token wasn't ignored, reprocess the current
2175 token. */
2176 $this->inBody(array(
2177 'name' => 'body',
2178 'type' => HTML5::ENDTAG
2181 return $this->afterBody($token);
2182 break;
2184 /* An end tag whose tag name is one of: "address", "blockquote",
2185 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2186 "ol", "pre", "ul" */
2187 case 'address': case 'blockquote': case 'center': case 'dir':
2188 case 'div': case 'dl': case 'fieldset': case 'listing':
2189 case 'menu': case 'ol': case 'pre': case 'ul':
2190 /* If the stack of open elements has an element in scope
2191 with the same tag name as that of the token, then generate
2192 implied end tags. */
2193 if($this->elementInScope($token['name'])) {
2194 $this->generateImpliedEndTags();
2196 /* Now, if the current node is not an element with
2197 the same tag name as that of the token, then this
2198 is a parse error. */
2199 // w/e
2201 /* If the stack of open elements has an element in
2202 scope with the same tag name as that of the token,
2203 then pop elements from this stack until an element
2204 with that tag name has been popped from the stack. */
2205 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2206 if($this->stack[$n]->nodeName === $token['name']) {
2207 $n = -1;
2210 array_pop($this->stack);
2213 break;
2215 /* An end tag whose tag name is "form" */
2216 case 'form':
2217 /* If the stack of open elements has an element in scope
2218 with the same tag name as that of the token, then generate
2219 implied end tags. */
2220 if($this->elementInScope($token['name'])) {
2221 $this->generateImpliedEndTags();
2225 if(end($this->stack)->nodeName !== $token['name']) {
2226 /* Now, if the current node is not an element with the
2227 same tag name as that of the token, then this is a parse
2228 error. */
2229 // w/e
2231 } else {
2232 /* Otherwise, if the current node is an element with
2233 the same tag name as that of the token pop that element
2234 from the stack. */
2235 array_pop($this->stack);
2238 /* In any case, set the form element pointer to null. */
2239 $this->form_pointer = null;
2240 break;
2242 /* An end tag whose tag name is "p" */
2243 case 'p':
2244 /* If the stack of open elements has a p element in scope,
2245 then generate implied end tags, except for p elements. */
2246 if($this->elementInScope('p')) {
2247 $this->generateImpliedEndTags(array('p'));
2249 /* If the current node is not a p element, then this is
2250 a parse error. */
2251 // k
2253 /* If the stack of open elements has a p element in
2254 scope, then pop elements from this stack until the stack
2255 no longer has a p element in scope. */
2256 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2257 if($this->elementInScope('p')) {
2258 array_pop($this->stack);
2260 } else {
2261 break;
2265 break;
2267 /* An end tag whose tag name is "dd", "dt", or "li" */
2268 case 'dd': case 'dt': case 'li':
2269 /* If the stack of open elements has an element in scope
2270 whose tag name matches the tag name of the token, then
2271 generate implied end tags, except for elements with the
2272 same tag name as the token. */
2273 if($this->elementInScope($token['name'])) {
2274 $this->generateImpliedEndTags(array($token['name']));
2276 /* If the current node is not an element with the same
2277 tag name as the token, then this is a parse error. */
2278 // w/e
2280 /* If the stack of open elements has an element in scope
2281 whose tag name matches the tag name of the token, then
2282 pop elements from this stack until an element with that
2283 tag name has been popped from the stack. */
2284 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2285 if($this->stack[$n]->nodeName === $token['name']) {
2286 $n = -1;
2289 array_pop($this->stack);
2292 break;
2294 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2295 "h5", "h6" */
2296 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2297 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2299 /* If the stack of open elements has in scope an element whose
2300 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2301 generate implied end tags. */
2302 if($this->elementInScope($elements)) {
2303 $this->generateImpliedEndTags();
2305 /* Now, if the current node is not an element with the same
2306 tag name as that of the token, then this is a parse error. */
2307 // w/e
2309 /* If the stack of open elements has in scope an element
2310 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2311 "h6", then pop elements from the stack until an element
2312 with one of those tag names has been popped from the stack. */
2313 while($this->elementInScope($elements)) {
2314 array_pop($this->stack);
2317 break;
2319 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2320 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2321 case 'a': case 'b': case 'big': case 'em': case 'font':
2322 case 'i': case 'nobr': case 's': case 'small': case 'strike':
2323 case 'strong': case 'tt': case 'u':
2324 /* 1. Let the formatting element be the last element in
2325 the list of active formatting elements that:
2326 * is between the end of the list and the last scope
2327 marker in the list, if any, or the start of the list
2328 otherwise, and
2329 * has the same tag name as the token.
2331 while(true) {
2332 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2333 if($this->a_formatting[$a] === self::MARKER) {
2334 break;
2336 } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2337 $formatting_element = $this->a_formatting[$a];
2338 $in_stack = in_array($formatting_element, $this->stack, true);
2339 $fe_af_pos = $a;
2340 break;
2344 /* If there is no such node, or, if that node is
2345 also in the stack of open elements but the element
2346 is not in scope, then this is a parse error. Abort
2347 these steps. The token is ignored. */
2348 if(!isset($formatting_element) || ($in_stack &&
2349 !$this->elementInScope($token['name']))) {
2350 break;
2352 /* Otherwise, if there is such a node, but that node
2353 is not in the stack of open elements, then this is a
2354 parse error; remove the element from the list, and
2355 abort these steps. */
2356 } elseif(isset($formatting_element) && !$in_stack) {
2357 unset($this->a_formatting[$fe_af_pos]);
2358 $this->a_formatting = array_merge($this->a_formatting);
2359 break;
2362 /* 2. Let the furthest block be the topmost node in the
2363 stack of open elements that is lower in the stack
2364 than the formatting element, and is not an element in
2365 the phrasing or formatting categories. There might
2366 not be one. */
2367 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2368 $length = count($this->stack);
2370 for($s = $fe_s_pos + 1; $s < $length; $s++) {
2371 $category = $this->getElementCategory($this->stack[$s]->nodeName);
2373 if($category !== self::PHRASING && $category !== self::FORMATTING) {
2374 $furthest_block = $this->stack[$s];
2378 /* 3. If there is no furthest block, then the UA must
2379 skip the subsequent steps and instead just pop all
2380 the nodes from the bottom of the stack of open
2381 elements, from the current node up to the formatting
2382 element, and remove the formatting element from the
2383 list of active formatting elements. */
2384 if(!isset($furthest_block)) {
2385 for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2386 array_pop($this->stack);
2389 unset($this->a_formatting[$fe_af_pos]);
2390 $this->a_formatting = array_merge($this->a_formatting);
2391 break;
2394 /* 4. Let the common ancestor be the element
2395 immediately above the formatting element in the stack
2396 of open elements. */
2397 $common_ancestor = $this->stack[$fe_s_pos - 1];
2399 /* 5. If the furthest block has a parent node, then
2400 remove the furthest block from its parent node. */
2401 if($furthest_block->parentNode !== null) {
2402 $furthest_block->parentNode->removeChild($furthest_block);
2405 /* 6. Let a bookmark note the position of the
2406 formatting element in the list of active formatting
2407 elements relative to the elements on either side
2408 of it in the list. */
2409 $bookmark = $fe_af_pos;
2411 /* 7. Let node and last node be the furthest block.
2412 Follow these steps: */
2413 $node = $furthest_block;
2414 $last_node = $furthest_block;
2416 while(true) {
2417 for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2418 /* 7.1 Let node be the element immediately
2419 prior to node in the stack of open elements. */
2420 $node = $this->stack[$n];
2422 /* 7.2 If node is not in the list of active
2423 formatting elements, then remove node from
2424 the stack of open elements and then go back
2425 to step 1. */
2426 if(!in_array($node, $this->a_formatting, true)) {
2427 unset($this->stack[$n]);
2428 $this->stack = array_merge($this->stack);
2430 } else {
2431 break;
2435 /* 7.3 Otherwise, if node is the formatting
2436 element, then go to the next step in the overall
2437 algorithm. */
2438 if($node === $formatting_element) {
2439 break;
2441 /* 7.4 Otherwise, if last node is the furthest
2442 block, then move the aforementioned bookmark to
2443 be immediately after the node in the list of
2444 active formatting elements. */
2445 } elseif($last_node === $furthest_block) {
2446 $bookmark = array_search($node, $this->a_formatting, true) + 1;
2449 /* 7.5 If node has any children, perform a
2450 shallow clone of node, replace the entry for
2451 node in the list of active formatting elements
2452 with an entry for the clone, replace the entry
2453 for node in the stack of open elements with an
2454 entry for the clone, and let node be the clone. */
2455 if($node->hasChildNodes()) {
2456 $clone = $node->cloneNode();
2457 $s_pos = array_search($node, $this->stack, true);
2458 $a_pos = array_search($node, $this->a_formatting, true);
2460 $this->stack[$s_pos] = $clone;
2461 $this->a_formatting[$a_pos] = $clone;
2462 $node = $clone;
2465 /* 7.6 Insert last node into node, first removing
2466 it from its previous parent node if any. */
2467 if($last_node->parentNode !== null) {
2468 $last_node->parentNode->removeChild($last_node);
2471 $node->appendChild($last_node);
2473 /* 7.7 Let last node be node. */
2474 $last_node = $node;
2477 /* 8. Insert whatever last node ended up being in
2478 the previous step into the common ancestor node,
2479 first removing it from its previous parent node if
2480 any. */
2481 if($last_node->parentNode !== null) {
2482 $last_node->parentNode->removeChild($last_node);
2485 $common_ancestor->appendChild($last_node);
2487 /* 9. Perform a shallow clone of the formatting
2488 element. */
2489 $clone = $formatting_element->cloneNode();
2491 /* 10. Take all of the child nodes of the furthest
2492 block and append them to the clone created in the
2493 last step. */
2494 while($furthest_block->hasChildNodes()) {
2495 $child = $furthest_block->firstChild;
2496 $furthest_block->removeChild($child);
2497 $clone->appendChild($child);
2500 /* 11. Append that clone to the furthest block. */
2501 $furthest_block->appendChild($clone);
2503 /* 12. Remove the formatting element from the list
2504 of active formatting elements, and insert the clone
2505 into the list of active formatting elements at the
2506 position of the aforementioned bookmark. */
2507 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2508 unset($this->a_formatting[$fe_af_pos]);
2509 $this->a_formatting = array_merge($this->a_formatting);
2511 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2512 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2513 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2515 /* 13. Remove the formatting element from the stack
2516 of open elements, and insert the clone into the stack
2517 of open elements immediately after (i.e. in a more
2518 deeply nested position than) the position of the
2519 furthest block in that stack. */
2520 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2521 $fb_s_pos = array_search($furthest_block, $this->stack, true);
2522 unset($this->stack[$fe_s_pos]);
2524 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2525 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2526 $this->stack = array_merge($s_part1, array($clone), $s_part2);
2528 /* 14. Jump back to step 1 in this series of steps. */
2529 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2531 break;
2533 /* An end tag token whose tag name is one of: "button",
2534 "marquee", "object" */
2535 case 'button': case 'marquee': case 'object':
2536 /* If the stack of open elements has an element in scope whose
2537 tag name matches the tag name of the token, then generate implied
2538 tags. */
2539 if($this->elementInScope($token['name'])) {
2540 $this->generateImpliedEndTags();
2542 /* Now, if the current node is not an element with the same
2543 tag name as the token, then this is a parse error. */
2544 // k
2546 /* Now, if the stack of open elements has an element in scope
2547 whose tag name matches the tag name of the token, then pop
2548 elements from the stack until that element has been popped from
2549 the stack, and clear the list of active formatting elements up
2550 to the last marker. */
2551 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2552 if($this->stack[$n]->nodeName === $token['name']) {
2553 $n = -1;
2556 array_pop($this->stack);
2559 $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2561 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2562 array_pop($this->a_formatting);
2565 break;
2567 /* Or an end tag whose tag name is one of: "area", "basefont",
2568 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2569 "input", "isindex", "noembed", "noframes", "param", "select",
2570 "spacer", "table", "textarea", "wbr" */
2571 case 'area': case 'basefont': case 'bgsound': case 'br':
2572 case 'embed': case 'hr': case 'iframe': case 'image':
2573 case 'img': case 'input': case 'isindex': case 'noembed':
2574 case 'noframes': case 'param': case 'select': case 'spacer':
2575 case 'table': case 'textarea': case 'wbr':
2576 // Parse error. Ignore the token.
2577 break;
2579 /* An end tag token not covered by the previous entries */
2580 default:
2581 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2582 /* Initialise node to be the current node (the bottommost
2583 node of the stack). */
2584 $node = end($this->stack);
2586 /* If node has the same tag name as the end tag token,
2587 then: */
2588 if($token['name'] === $node->nodeName) {
2589 /* Generate implied end tags. */
2590 $this->generateImpliedEndTags();
2592 /* If the tag name of the end tag token does not
2593 match the tag name of the current node, this is a
2594 parse error. */
2595 // k
2597 /* Pop all the nodes from the current node up to
2598 node, including node, then stop this algorithm. */
2599 for($x = count($this->stack) - $n; $x >= $n; $x--) {
2600 array_pop($this->stack);
2603 } else {
2604 $category = $this->getElementCategory($node);
2606 if($category !== self::SPECIAL && $category !== self::SCOPING) {
2607 /* Otherwise, if node is in neither the formatting
2608 category nor the phrasing category, then this is a
2609 parse error. Stop this algorithm. The end tag token
2610 is ignored. */
2611 return false;
2615 break;
2617 break;
/**
 * Processes one token while the insertion mode is "in table".
 *
 * Returns false when an end tag "table" is ignored because no table
 * element is in table scope, the result of inTableBody() for "td", "th"
 * and "tr" start tags, the result of mainPhase() for a nested "table"
 * start tag, and nothing (null) in every other case.
 */
private function inTable($token) {
    /* Tag names handed to clearStackToTableContext() when the stack is
    cleared back to a table context. */
    $boundary = array('html', 'table');

    $kind = $token['type'];
    $opens = ($kind === HTML5::STARTTAG);
    $closes = ($kind === HTML5::ENDTAG);

    /* Only tag tokens are asked for a name; character and comment tokens
    are never probed for one. */
    $tag = ($opens || $closes) ? $token['name'] : null;

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append the character to the current node. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* A comment token: append a Comment node to the current node with the
    data attribute set to the data given in the comment token. */
    if($kind === HTML5::COMMENT) {
        $note = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($note);
        return;
    }

    if($opens) {
        switch($tag) {
            /* A start tag whose tag name is "caption" */
            case 'caption':
                /* Clear the stack back to a table context, insert a marker
                at the end of the list of active formatting elements, then
                insert the element and switch to "in caption". */
                $this->clearStackToTableContext($boundary);
                $this->a_formatting[] = self::MARKER;
                $this->insertElement($token);
                $this->mode = self::IN_CAPTION;
                return;

            /* A start tag whose tag name is "colgroup" */
            case 'colgroup':
                /* Clear the stack back to a table context, insert the
                element and switch to "in column group". */
                $this->clearStackToTableContext($boundary);
                $this->insertElement($token);
                $this->mode = self::IN_CGROUP;
                return;

            /* A start tag whose tag name is "col" */
            case 'col':
                /* Act as if a "colgroup" start tag had been seen, then
                hand the col token to the column group handler. */
                $this->inTable(array(
                    'name' => 'colgroup',
                    'type' => HTML5::STARTTAG,
                    'attr' => array()
                ));
                $this->inColumnGroup($token);
                return;

            /* A start tag whose tag name is one of: "tbody", "tfoot",
            "thead" */
            case 'tbody': case 'tfoot': case 'thead':
                /* Clear the stack back to a table context, insert the
                element and switch to "in table body". */
                $this->clearStackToTableContext($boundary);
                $this->insertElement($token);
                $this->mode = self::IN_TBODY;
                return;

            /* A start tag whose tag name is one of: "td", "th", "tr" */
            case 'td': case 'th': case 'tr':
                /* Act as if a start tag token with the tag name "tbody"
                had been seen, then reprocess the current token. */
                $this->inTable(array(
                    'name' => 'tbody',
                    'type' => HTML5::STARTTAG,
                    'attr' => array()
                ));
                return $this->inTableBody($token);

            /* A start tag whose tag name is "table" */
            case 'table':
                /* Parse error. Act as if an end tag token with the tag
                name "table" had been seen, then reprocess the current
                token. The outcome of the synthetic end tag is not
                inspected here. */
                $this->inTable(array(
                    'name' => 'table',
                    'type' => HTML5::ENDTAG
                ));
                return $this->mainPhase($token);
        }
    }

    if($closes) {
        /* An end tag whose tag name is "table" */
        if($tag === 'table') {
            /* If the stack of open elements does not have an element in
            table scope with the same tag name as the token, this is a
            parse error. Ignore the token. (innerHTML case) */
            if(!$this->elementInScope($tag, true)) {
                return false;
            }

            /* Generate implied end tags. (If the current node is then not
            a table element it is a parse error, which is not reported.) */
            $this->generateImpliedEndTags();

            /* Pop elements from this stack until a table element has been
            popped from the stack. */
            do {
                $popped = end($this->stack)->nodeName;
                array_pop($this->stack);
            } while($popped !== 'table');

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
            return;
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr":
        parse error, ignore the token. */
        if(in_array($tag, array('body', 'caption', 'col', 'colgroup', 'html',
        'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'))) {
            return;
        }
    }

    /* Anything else: parse error. Process the token as if the insertion
    mode was "in body", with the following exception: if the current node
    is a table, tbody, tfoot, thead, or tr element, then, whenever a node
    would be inserted into the current node, it must instead be inserted
    into the foster parent element. */
    $current = end($this->stack)->nodeName;

    if(in_array($current, array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
        /* Walk the stack of open elements from the current node upwards
        looking for the last table element. */
        $at = count($this->stack) - 1;

        while($at >= 0 && $this->stack[$at]->nodeName !== 'table') {
            $at--;
        }

        if($at < 0) {
            /* No table element in the stack (innerHTML case): the foster
            parent is the first element in the stack of open elements. */
            $this->foster_parent = $this->stack[0];

        } elseif($this->stack[$at]->parentNode !== null) {
            /* The table has a parent node: that parent is the foster
            parent. */
            $this->foster_parent = $this->stack[$at]->parentNode;

        } else {
            /* The table has no parent: the foster parent is the element
            before the table in the stack of open elements. */
            $this->foster_parent = $this->stack[$at - 1];
        }
    }

    $this->inBody($token);
}
/**
 * Processes one token while the insertion mode is "in caption".
 *
 * Returns the result of inTable() when the token forces the caption to be
 * closed and is then reprocessed; returns nothing (null) otherwise.
 */
private function inCaption($token) {
    /* An end tag whose tag name is "caption" */
    if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then reprocess the current token. The outcome of
        the synthetic end tag is not inspected here. */
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        // Parse error. Ignore the token.
        // "td" was previously absent from this list, so a stray </td>
        // inside a caption was passed on to inBody() instead of being
        // ignored.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Processes one token while the insertion mode is "in column group".
 *
 * Returns the result of inTable() when the token is not handled here and
 * is reprocessed there; returns nothing (null) otherwise.
 */
private function inColumnGroup($token) {
    $kind = $token['type'];

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append the character to the current node. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* A comment token: append a Comment node to the current node with the
    data attribute set to the data given in the comment token. */
    if($kind === HTML5::COMMENT) {
        $note = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($note);
        return;
    }

    /* A start tag whose tag name is "col": insert a col element for the
    token, then immediately pop the current node off the stack of open
    elements. */
    if($kind === HTML5::STARTTAG && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    if($kind === HTML5::ENDTAG) {
        /* An end tag whose tag name is "colgroup" */
        if($token['name'] === 'colgroup') {
            /* If the current node is the root html element, then this is
            a parse error, ignore the token. (innerHTML case) Otherwise,
            pop the current node (which will be a colgroup element) from
            the stack of open elements and switch the insertion mode to
            "in table". */
            if(end($this->stack)->nodeName !== 'html') {
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

            return;
        }

        /* An end tag whose tag name is "col": parse error, ignore the
        token. */
        if($token['name'] === 'col') {
            return;
        }
    }

    /* Anything else: act as if an end tag with the tag name "colgroup"
    had been seen, and then reprocess the current token. The outcome of
    the synthetic end tag is not inspected here. */
    $this->inColumnGroup(array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    ));

    return $this->inTable($token);
}
/**
 * Handles a token while the insertion mode is "in table body".
 *
 * @param array $token Token emitted by the tokeniser ('type', plus 'name'
 *                     for start/end tag tokens).
 * @return mixed The result of the handler the token was reprocessed by
 *               (inRow() or mainPhase()), otherwise null.
 */
private function inTableBody($token) {
    /* Elements at which "clear the stack back to a table body context"
    stops popping. */
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table".
    (Previously the list contained the typo "tfoor" and the second
    alternative tested for a START tag, so neither a <tfoot> start tag nor
    a </table> end tag ever reached this branch.) */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'), true)) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if($this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'), true)) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Handles a token while the insertion mode is "in row".
 *
 * @param array $token Token emitted by the tokeniser ('type', plus 'name'
 *                     for start/end tag tokens).
 * @return mixed The result of inTableBody() when the token is reprocessed
 *               there, otherwise null.
 */
private function inRow($token) {
    /* Elements at which "clear the stack back to a table row context"
    stops popping. */
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'), true)) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied </tr> is ignored exactly when no tr is in table scope, so
        that is checked up front. */
        if($this->elementInScope('tr', true)) {
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            /* The implied </tr> switched the mode to "in table body", so
            the token is reprocessed by that handler. Handing it to the
            "in cell" handler, as this code used to, drops the token. */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            /* Reprocess in "in table body". Reprocessing through inCell()
            bounced the token back to inRow() and recursed without end
            once the tr had been popped. */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'), true)) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Handles a token while the insertion mode is "in cell".
 *
 * @param array $token Token emitted by the tokeniser ('type', plus 'name'
 *                     for start/end tag tokens).
 * @return mixed The result of inRow() when the token is reprocessed there
 *               after closing the cell, otherwise null.
 */
private function inCell($token) {
    /* An end tag whose tag name is one of: "td", "th" */
    if($token['type'] === HTML5::ENDTAG &&
    ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. The
            null check only stops the loop should the stack ever run
            empty. */
            do {
                $popped = array_pop($this->stack);
            } while($popped !== null && $popped->nodeName !== $token['name']);

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr". (This branch used to be
    present twice; the second, unreachable copy has been removed.) */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) Otherwise, close the cell and reprocess the
        current token. */
        if($this->elementInScope(array('td', 'th'), true)) {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'), true)) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored.
        Otherwise, close the cell and reprocess the current token. */
        if($this->elementInScope($token['name'], true)) {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Handles a token while the insertion mode is "in select".
 *
 * @param array $token Token emitted by the tokeniser ('type', plus 'data'
 *                     for character/comment tokens and 'name' for tags).
 * @return void
 */
private function inSelect($token) {
    /* Synthetic end tags reused by several branches below. */
    $endOption = array('name' => 'option', 'type' => HTML5::ENDTAG);
    $endOptgroup = array('name' => 'optgroup', 'type' => HTML5::ENDTAG);
    $endSelect = array('name' => 'select', 'type' => HTML5::ENDTAG);

    /* A character token */
    if($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect($endOption);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect($endOption);
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect($endOptgroup);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $depth = count($this->stack);

        if($depth >= 2 &&
        $this->stack[$depth - 1]->nodeName === 'option' &&
        $this->stack[$depth - 2]->nodeName === 'optgroup') {
            $this->inSelect($endOption);
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is looked up again because the
        implied </option> above may have popped the stack; the node's name
        (not the node object itself) is what gets compared. */
        if(end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if($this->elementInScope($token['name'], true)) {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. The null check only
            stops the loop should the stack ever run empty. */
            do {
                $popped = array_pop($this->stack);
            } while($popped !== null && $popped->nodeName !== 'select');

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect($endSelect);

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'), true)) {
        /* Parse error. If the stack of open elements has an element in
        table scope with the same tag name as that of the token, then act
        as if an end tag with the tag name "select" had been seen, and
        reprocess the token. Otherwise, ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            $this->inSelect($endSelect);
            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles a token while the insertion mode is "after body".
 *
 * @param array $token Token emitted by the tokeniser ('type', plus 'data'
 *                     for character/comment tokens and 'name' for tags).
 * @return mixed The result of inBody() when the token is reprocessed in
 *               the "in body" mode, otherwise null.
 */
private function afterBody($token) {
    $type = $token['type'];

    /* Whitespace-only character token: handled exactly as in "in body". */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* Comment token: the Comment node is appended to the first element in
    the stack of open elements (the html element). */
    if($type === HTML5::COMMENT) {
        $this->stack[0]->appendChild(
            $this->dom->createComment($token['data'])
        );
        return;
    }

    /* </html>: switch to the trailing end phase. (The innerHTML case, in
    which the token would be ignored, is not handled here.) */
    if($type === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Set the insertion mode to "in body" and
    reprocess the token. */
    $this->mode = self::IN_BODY;
    return $this->inBody($token);
}
/**
 * Handles a token while the insertion mode is "in frameset".
 *
 * @param array $token Token emitted by the tokeniser ('type', plus 'data'
 *                     for character/comment tokens and 'name' for tags).
 * @return void
 */
private function inFrameset($token) {
    /* 'name' is only read once the token is known to be a tag, so that
    character tokens (which carry no name) never cause an undefined-index
    notice. */
    $isStart = $token['type'] === HTML5::STARTTAG;
    $isEnd = $token['type'] === HTML5::ENDTAG;

    /* A whitespace-only character token (as matched by the pattern below) */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif($isStart && $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($isEnd && $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName !== 'html') {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, change
            the insertion mode to "after frameset". While still inside an
            outer frameset the mode must stay "in frameset", otherwise the
            remaining frames of the outer frameset would be rejected. */
            if(end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($isStart && $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($isStart && $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles a token while the insertion mode is "after frameset".
 *
 * @param array $token Token emitted by the tokeniser ('type', plus 'data'
 *                     for character/comment tokens and 'name' for tags).
 * @return void
 */
private function afterFrameset($token) {
    /* A whitespace-only character token (as matched by the pattern below) */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html". The type is tested before the
    name so that non-whitespace character tokens, which carry no 'name',
    fall through to "anything else" without an undefined-index notice. */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes a token emitted by the tokeniser during the trailing end
 * phase (the phase that follows the main phase).
 *
 * @param array $token Token emitted by the tokeniser ('type', plus 'data'
 *                     for character/comment tokens).
 * @return mixed The result of mainPhase() when the parser switches back to
 *               the main phase, otherwise null.
 */
private function trailingEndPhase($token) {
    /* A DOCTYPE token */
    if($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE. Or a start tag token. Or an end tag token.
    Whitespace-only character tokens were consumed by the branch above, so
    any character token arriving here is a non-whitespace one. (This
    branch used to repeat the whitespace test without negating it, which
    silently discarded such text.) */
    } elseif($token['type'] === HTML5::CHARACTR ||
    $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($token['type'] === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
/**
 * Creates an element for a start tag token, copies the token's attributes
 * onto it, attaches it through appendToRealParent() and pushes it onto the
 * stack of open elements.
 *
 * @param array $token  Start tag token; 'name' and 'attr' are read.
 * @param bool  $append Not used by this method.
 * @param bool  $check  When true, the tag name is first reduced to
 *                      something libxml2 accepts as an element name.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false) {
    $name = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if ($check) {
        // Keep only ASCII letters, digits and hyphens, then strip any
        // leading hyphens and digits.
        $name = ltrim(preg_replace('/[^a-z0-9-]/i', '', $name), '-0..9');

        // In theory this should never be needed, but just in case:
        // fall back to an arbitrary generic element.
        if ($name === '') {
            $name = 'span';
        }
    }

    $el = $this->dom->createElement($name);

    // When an attribute name occurs more than once, the first one wins.
    foreach($token['attr'] as $attr) {
        if($el->hasAttribute($attr['name'])) {
            continue;
        }

        $el->setAttribute($attr['name'], $attr['value']);
    }

    $this->appendToRealParent($el);
    $this->stack[] = $el;

    return $el;
}
/**
 * Creates a text node holding $data and attaches it through
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data) {
    $this->appendToRealParent($this->dom->createTextNode($data));
}
/**
 * Creates a comment node holding $data and attaches it through
 * appendToRealParent().
 *
 * @param string $data Text of the new comment node.
 * @return void
 */
private function insertComment($data) {
    $this->appendToRealParent($this->dom->createComment($data));
}
/**
 * Attaches $node to the tree: to the current node normally, or to / next
 * to the foster parent when one is set. The foster parent is cleared
 * again after it has been used.
 *
 * @param DOMNode $node Node to attach.
 * @return void
 */
private function appendToRealParent($node) {
    /* No foster parent set: plain append to the current node. */
    if($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* Find the last table element in the stack of open elements that
    already has a parent node. */
    $lastTable = null;
    $index = count($this->stack);

    while(--$index >= 0) {
        $candidate = $this->stack[$index];

        if($candidate->nodeName === 'table' &&
        $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    /* If the foster parent element is the parent element of that table,
    the new node is inserted immediately before the table; otherwise it is
    appended to the foster parent element. */
    if($lastTable !== null &&
    $this->foster_parent->isSameNode($lastTable->parentNode)) {
        $this->foster_parent->insertBefore($node, $lastTable);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
/**
 * Tells whether the stack of open elements "has an element in scope"
 * ($table = false) or "has an element in table scope" ($table = true).
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            of which any one may match.
 * @param bool         $table True for the "in table scope" variant.
 * @return bool True when a matching element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false) {
    /* A list of names matches as soon as any single name does. */
    if(is_array($el)) {
        foreach($el as $element) {
            if($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* Elements that end the search in the general (non-table) variant. */
    $scoping = array('caption', 'td', 'th', 'button', 'marquee', 'object');

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the top of the stack. */
    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        /* 2. If node is the target node, terminate in a match state. */
        if($node->tagName === $el) {
            return true;
        }

        /* 3. Otherwise, if node is a table element, terminate in a failure
        state. */
        if($node->tagName === 'table') {
            return false;
        }

        /* 4. Otherwise, if the algorithm is the "has an element in scope"
        variant (rather than the "has an element in table scope" variant),
        and node is one of the scoping elements, terminate in a failure
        state. (The test on $table used to be inverted, which made the
        table-scope variant stop at cells and captions instead.) */
        if($table !== true && in_array($node->tagName, $scoping, true)) {
            return false;
        }

        /* 5. Otherwise, if node is an html element (root element),
        terminate in a failure state. */
        if($node === $node->ownerDocument->documentElement) {
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack of
        open elements. */
    }

    /* The whole stack was searched without a match; always hand back a
    boolean. */
    return false;
}
/**
 * Reconstructs the active formatting elements: every entry at the end of
 * the list of active formatting elements that comes after the last marker
 * and after the last entry still on the stack of open elements is
 * shallow-cloned, appended to the current node, pushed onto the stack and
 * substituted for the original entry in the list.
 *
 * Like the rest of this method's index arithmetic, this relies on the
 * list being indexed 0..n-1.
 *
 * @return bool|null False when there was nothing to reconstruct; null
 *                   after entries have been reconstructed.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if($formatting_elements === 0) {
        return false;
    }

    /* 2./3. If the last (most recently added) entry in the list is a
    marker, or an element that is in the stack of open elements, then there
    is nothing to reconstruct; stop this algorithm. */
    $entry = end($this->a_formatting);

    if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 4.-6. Walk backwards while the entry one earlier is neither a marker
    nor an element on the stack of open elements. Afterwards $first is the
    index of the earliest entry that has to be reconstructed. Stopping in
    front of the marker / open element (rather than on it) is what step 7
    ("let entry be the element one later") asks for; the previous code
    skipped that step and then tried to clone the marker or the element
    that was still open. */
    $first = $formatting_elements - 1;

    while($first > 0) {
        $earlier = $this->a_formatting[$first - 1];

        if($earlier === self::MARKER ||
        in_array($earlier, $this->stack, true)) {
            break;
        }

        $first--;
    }

    /* 7.-11. Reconstruct every entry from $first up to the end of the
    list. */
    for($a = $first; $a < $formatting_elements; $a++) {
        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $this->a_formatting[$a]->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;
    }
}
/**
 * Clears the list of active formatting elements up to the last marker:
 * entries are removed from the end of the list until a marker has been
 * removed or the list is empty.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    /* Stopping on an empty list keeps this from looping forever when the
    list holds no marker at all. */
    while(count($this->a_formatting) > 0) {
        /* 1./2. Take the last (most recently added) entry off the list. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if($entry === self::MARKER) {
            break;
        }
    }
}
/**
 * Generates implied end tags: while the current node is a dd, dt, li, p,
 * td, th or tr element whose tag name is not listed in $exclude, it is
 * popped off the stack of open elements.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array()) {
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    /* The emptiness check keeps end() from yielding false (and the
    property access from failing) should the stack ever be exhausted. */
    while(count($this->stack) > 0 &&
    in_array(end($this->stack)->nodeName, $elements, true)) {
        array_pop($this->stack);
    }
}
/**
 * Classifies an element by its tag name, checking the special, scoping
 * and formatting name lists in that order.
 *
 * @param DOMElement $node Element to classify.
 * @return mixed self::SPECIAL, self::SCOPING or self::FORMATTING for a
 *               name found in the corresponding list, self::PHRASING for
 *               any other name.
 */
private function getElementCategory($node) {
    $name = $node->tagName;

    /* Name lists paired with their category, in order of precedence. */
    $lookup = array(
        array($this->special, self::SPECIAL),
        array($this->scoping, self::SCOPING),
        array($this->formatting, self::FORMATTING)
    );

    foreach($lookup as $pair) {
        if(in_array($name, $pair[0])) {
            return $pair[1];
        }
    }

    return self::PHRASING;
}
/**
 * Clears the stack of open elements back to a table context: nodes are
 * popped until the current node's name is one of $elements.
 *
 * @param array $elements Node names at which popping stops.
 * @return void
 */
private function clearStackToTableContext($elements) {
    /* Any element popped here constitutes a parse error; it is not
    reported. */
    while(!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
/**
 * Resets the insertion mode from the current stack of open elements.
 *
 * The stack is walked from its last node towards its first. The first node
 * whose name maps to an insertion mode decides $this->mode and ends the
 * walk. If the first node of the stack is reached without a match, the
 * mode falls back to "in body". An empty stack leaves the mode untouched.
 *
 * @return void
 */
private function resetInsertionMode() {
    $last = false;
    $top = count($this->stack) - 1;

    for($i = $top; $i >= 0; $i--) {
        $node = $this->stack[$i];

        // Reaching the first node of the stack enables the final
        // "in body" fallback below.
        if($this->stack[0]->isSameNode($node)) {
            $last = true;
        }

        switch($node->nodeName) {
            case 'select':
                $this->mode = self::IN_SELECT;
                return;

            case 'td':
            case 'th':
                $this->mode = self::IN_CELL;
                return;

            case 'tr':
                $this->mode = self::IN_ROW;
                return;

            case 'tbody':
            case 'thead':
            case 'tfoot':
                $this->mode = self::IN_TBODY;
                return;

            case 'caption':
                $this->mode = self::IN_CAPTION;
                return;

            case 'colgroup':
                $this->mode = self::IN_CGROUP;
                return;

            case 'table':
                $this->mode = self::IN_TABLE;
                return;

            // A head element maps to "in body", not "in head".
            case 'head':
            case 'body':
                $this->mode = self::IN_BODY;
                return;

            case 'frameset':
                $this->mode = self::IN_FRAME;
                return;

            case 'html':
                // Before or after head, depending on whether a head
                // element has been recorded yet.
                if($this->head_pointer === null) {
                    $this->mode = self::BEFOR_HEAD;
                } else {
                    $this->mode = self::AFTER_HEAD;
                }
                return;

            default:
                if($last) {
                    $this->mode = self::IN_BODY;
                    return;
                }
        }
    }
}
/**
 * Closes the currently open table cell, if there is one.
 *
 * Checks for a td, then a th, element in table scope. For the first one
 * found, an end tag token with that name is passed to inCell(), and the
 * search stops.
 *
 * @return void
 */
private function closeCell() {
    $cellNames = array('td', 'th');

    foreach($cellNames as $tagName) {
        if(!$this->elementInScope($tagName, true)) {
            continue;
        }

        $endTag = array(
            'name' => $tagName,
            'type' => HTML5::ENDTAG
        );
        $this->inCell($endTag);

        return;
    }
}
/**
 * Returns the value held in $this->dom.
 *
 * NOTE(review): presumably the DOM document assembled by this tree
 * constructor -- confirm against where $this->dom is initialised.
 */
3891 public function save() {
3892 return $this->dom;