Fix autoparagraph bug with non-inline elements.
[htmlpurifier.git] / maintenance / PH5P.php
blob96d0d13f92ac223288db315bc3e9168ace44aa0b
1 <?php
2 class HTML5 {
// Input HTML string being tokenized; assigned in the constructor.
3 private $data;
// Index into $data of the current input character; starts at -1 and is
// incremented before each read.
4 private $char;
// Length of $data (strlen); a $char equal to this value means end of input.
5 private $EOF;
// Name of the current tokenizer state; the method "<state>State" is invoked
// in a loop until this becomes null.
6 private $state;
// HTML5TreeConstructer instance that receives every emitted token.
7 private $tree;
// Tag, comment or DOCTYPE token currently being built across state calls.
8 private $token;
// Content model flag: one of self::PCDATA, RCDATA, CDATA or PLAINTEXT.
9 private $content_model;
// Escape flag toggled by "<!--" / "-->" while in RCDATA or CDATA content.
10 private $escape = false;
11 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
12 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
13 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
14 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
15 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
16 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
17 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
18 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
19 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
20 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
21 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
22 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
23 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
24 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
25 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
26 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
27 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
28 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
29 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
30 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
31 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
32 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
33 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
34 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
35 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
36 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
37 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
38 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
39 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
40 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
41 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
42 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
43 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
44 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
45 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
46 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
47 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
48 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
49 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
50 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
51 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
52 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
// Content model flag values stored in $content_model.
54 const PCDATA = 0;
55 const RCDATA = 1;
56 const CDATA = 2;
57 const PLAINTEXT = 3;
// Token type identifiers used as the 'type' key of emitted token arrays.
// Note: the character-token constant is spelled CHARACTR throughout this class.
59 const DOCTYPE = 0;
60 const STARTTAG = 1;
61 const ENDTAG = 2;
62 const COMMENT = 3;
63 const CHARACTR = 4;
64 const EOF = 5;
/**
 * Tokenizes $data immediately, feeding every token to a new tree constructor.
 *
 * Newlines are normalised first: both CRLF pairs and lone CR characters
 * become LF. (The previous code assigned the lone-CR replacement to a
 * misspelled variable, so that step was silently discarded.)
 *
 * @param string $data Raw HTML source to parse.
 */
public function __construct($data) {
    // "\r\n" must be listed before "\r" so a CRLF pair collapses to one LF.
    $data = str_replace(array("\r\n", "\r"), "\n", $data);

    $this->data = $data;
    $this->char = -1;
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    // Run the state machine: each state method advances the input and
    // selects the next state; EOF() sets the state to null to stop.
    while($this->state !== null) {
        $this->{$this->state.'State'}();
    }
}
/**
 * Returns whatever the tree constructor's save() produces for the
 * document that was tokenized in the constructor.
 */
public function save() {
    $result = $this->tree->save();

    return $result;
}
/**
 * Returns the input character at the current position, or false once the
 * position has reached the end of the input.
 */
private function char() {
    if($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
/**
 * Reads from the input without moving the current position.
 *
 * @param int $s Start index into the input.
 * @param int $l Number of characters to read; 0 means "the single
 *               character at $s".
 * @return string|null The requested character(s), or null when the
 *                     request does not lie entirely inside the input.
 */
private function character($s, $l = 0) {
    // A negative start would make PHP count from the end of the string,
    // which is never what a look-behind near the start of input wants.
    if($s < 0) {
        return null;
    }

    if($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // "<=" rather than "<": a substring that ends exactly at the last
    // character of the input is still fully available.
    return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
}
/**
 * Returns the longest run of characters, starting at $start, that all
 * belong to the given PCRE character class.
 *
 * @param string $char_class Contents of a PCRE character class (without the
 *                           surrounding brackets), e.g. 'A-Za-z' or '^>'.
 * @param int    $start      Index into the input where the run must begin.
 * @return string The matching run, or '' when the character at $start is
 *                not in the class. (The previous preg_replace-based version
 *                returned the entire remaining input in that case, because
 *                a non-matching pattern leaves the subject unchanged.)
 */
private function characters($char_class, $start) {
    // Cast guards against substr() returning false on older PHP versions.
    $subject = (string) substr($this->data, $start);

    if(preg_match('#^['.$char_class.']+#', $subject, $match)) {
        return $match[0];
    }

    return '';
}
/**
 * Data state: the tokenizer's default state.
 *
 * Consumes the next input character and, depending on it and on the content
 * model flag, switches state or emits character tokens.
 *
 * Fixes relative to the previous version:
 *  - the "<!--" look-behind started one character too early (char - 4);
 *    the four characters ending at the current one start at char - 3;
 *  - the "-->" look-behind read forward from the current character instead
 *    of the three characters ending at it;
 *  - in the "anything else" branch a '<' or '&' that must be treated as text
 *    produced a zero-length run, rewinding the position and looping forever.
 */
private function dataState() {
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    $rcdata_or_cdata = ($this->content_model === self::RCDATA ||
        $this->content_model === self::CDATA);

    if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        In PCDATA or RCDATA: switch to the entity data state. Otherwise it
        is handled by the "anything else" branch below. */
        $this->state = 'entityData';

    } elseif($char === '-') {
        /* In RCDATA/CDATA with the escape flag off, if the last four
        characters including this one are "<!--", set the escape flag. */
        if($rcdata_or_cdata && $this->escape === false &&
        $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--') {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    /* U+003C LESS-THAN SIGN (<) */
    } elseif($char === '<' && ($this->content_model === self::PCDATA ||
    ($rcdata_or_cdata && $this->escape === false))) {
        /* PCDATA, or RCDATA/CDATA with the escape flag off: switch to the
        tag open state. Otherwise handled as "anything else". */
        $this->state = 'tagOpen';

    /* U+003E GREATER-THAN SIGN (>) */
    } elseif($char === '>') {
        /* In RCDATA/CDATA with the escape flag on, if the last three
        characters including this one are "-->", clear the escape flag. */
        if($rcdata_or_cdata && $this->escape === true &&
        $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->') {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif($this->content_model === self::PLAINTEXT) {
        /* PLAINTEXT content model
        THIS DIFFERS GREATLY FROM THE SPEC: emit all remaining input as one
        character token, then finish. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => substr($this->data, $this->char)
        ));

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: gather the run of characters up
        to the next '<' or '&' and emit it as a single character token.
        Stay in the data state. */
        $len = strcspn($this->data, '<&', $this->char);

        // The current character itself may be a '<' or '&' that has to be
        // treated as text (e.g. '&' in CDATA). Always consume at least it,
        // otherwise the position would move backwards and never advance.
        if($len === 0) {
            $len = 1;
        }

        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        $this->state = 'data';
    }
}
/**
 * Entity data state: entered from the data state after an '&'.
 *
 * Tries to consume an entity and emits the result as a character token,
 * falling back to a literal '&' when no entity could be consumed, then
 * returns to the data state.
 *
 * Fixes relative to the previous version:
 *  - the token is emitted as an array with 'type' and 'data' keys, like
 *    every other character token in this class, instead of a bare string;
 *  - failure is detected with a strict comparison against false, so an
 *    entity that decodes to "0" is no longer mistaken for "no entity".
 */
private function entityDataState() {
    // Attempt to consume an entity.
    $entity = $this->entity();

    // entity() returns false when nothing was consumed.
    $char = ($entity === false) ? '&' : $entity;

    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => $char
    ));

    // Finally, switch to the data state.
    $this->state = 'data';
}
/**
 * Tag open state: entered from the data state after a '<'.
 *
 * In RCDATA/CDATA content only "</" is meaningful; anything else makes the
 * '<' plain text. In PCDATA content the next character decides between a
 * markup declaration, an end tag, a start tag, a bogus comment, or text.
 * For any other content model value nothing is done.
 */
private function tagOpenState() {
    $model = $this->content_model;

    // Loose comparisons mirror the switch statement this replaces.
    if($model == self::RCDATA || $model == self::CDATA) {
        if($this->character($this->char + 1) === '/') {
            // Consume the solidus and look for a matching end tag.
            $this->char++;
            $this->state = 'closeTagOpen';
        } else {
            // Not an end tag: the '<' is literal text.
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => '<'
            ));

            $this->state = 'data';
        }
        return;
    }

    if($model != self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $next = $this->char();

    if($next === '!') {
        /* U+0021 EXCLAMATION MARK: markup declaration (comment/DOCTYPE). */
        $this->state = 'markupDeclarationOpen';
        return;
    }

    if($next === '/') {
        /* U+002F SOLIDUS: start of an end tag. */
        $this->state = 'closeTagOpen';
        return;
    }

    if(preg_match('/^[A-Za-z]$/', $next)) {
        /* ASCII letter: begin a start tag token with a lowercased name.
        It is emitted later, once the tag is complete. */
        $this->token = array(
            'name' => strtolower($next),
            'type' => self::STARTTAG,
            'attr' => array()
        );

        $this->state = 'tagName';
        return;
    }

    if($next === '>') {
        /* "<>" is a parse error; both characters become text. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '<>'
        ));

        $this->state = 'data';
        return;
    }

    if($next === '?') {
        /* "<?" is a parse error handled as a bogus comment. */
        $this->state = 'bogusComment';
        return;
    }

    /* Anything else: parse error. Emit '<' as text and reconsume the
    current character in the data state. */
    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => '<'
    ));

    $this->char--;
    $this->state = 'data';
}
/**
 * Close tag open state: entered after "</".
 *
 * In RCDATA/CDATA content an end tag is only honoured when its name equals
 * the nodeName of the last element on the tree constructor's stack and is
 * followed by whitespace, '>' or '/'; otherwise "</" is emitted as text.
 * In all remaining cases the next character decides between an end tag
 * token, an ignored "</>", EOF handling, or a bogus comment.
 */
private function closeTagOpenState() {
    $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
    $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;

    $raw_text = ($this->content_model === self::RCDATA || $this->content_model === self::CDATA);

    // Decide whether the "</" has to be treated as literal text.
    $literal = false;
    if($raw_text) {
        if(!$the_same) {
            $literal = true;
        } else {
            $follower = $this->character($this->char + 1 + strlen($next_node));
            $literal = !preg_match('/[\t\n\x0b\x0c >\/]/', $follower) || $this->EOF === $this->char;
        }
    }

    if($literal) {
        /* Parse error: emit "</" as character data and go back to the data
        state to process the next input character. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));

        $this->state = 'data';
        return;
    }

    /* PCDATA content, or the tag name matched: consume the next input
    character. */
    $this->char++;
    $c = $this->char();

    if(preg_match('/^[A-Za-z]$/', $c)) {
        /* ASCII letter: begin an end tag token with a lowercased name.
        It is emitted later, once the tag is complete. */
        $this->token = array(
            'name' => strtolower($c),
            'type' => self::ENDTAG
        );

        $this->state = 'tagName';
        return;
    }

    if($c === '>') {
        /* "</>": parse error, nothing is emitted. */
        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit "</" as text and reconsume the EOF in the
        data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));

        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: parse error, treated as a bogus comment. */
    $this->state = 'bogusComment';
}
/**
 * Tag name state: extends the name of the tag token under construction
 * until whitespace, '/', '>' or the end of input is reached.
 */
private function tagNameState() {
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        /* Tab, LF, VT, FF or space: the name is complete, attributes may
        follow. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if($c === '>') {
        /* End of tag: emit the current tag token. */
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit the current tag token and reconsume the
        EOF in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    if($c === '/') {
        /* Solidus: parse error unless it is a permitted slash. */
        $this->state = 'beforeAttributeName';
        return;
    }

    /* Anything else: append the lowercased character to the tag name and
    stay in this state. */
    $this->token['name'] .= strtolower($c);
    $this->state = 'tagName';
}
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * and starts a new attribute on the first other character.
 */
private function beforeAttributeNameState() {
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        /* Tab, LF, VT, FF or space: keep waiting for an attribute name. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if($c === '>') {
        /* End of tag: emit the current tag token. */
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($c === '/') {
        /* Solidus: parse error unless it is a permitted slash; stay. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit the current tag token and reconsume the
        EOF in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: start a new attribute whose name is the lowercased
    character and whose value is not yet set. */
    $this->token['attr'][] = array(
        'name' => strtolower($c),
        'value' => null
    );

    $this->state = 'attributeName';
}
/**
 * Attribute name state: extends the name of the most recently started
 * attribute until whitespace, '=', '>', a non-closing '/', or EOF.
 */
private function attributeNameState() {
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        /* Tab, LF, VT, FF or space: the name is complete; switch to the
        after attribute name state. */
        $this->state = 'afterAttributeName';
        return;
    }

    if($c === '=') {
        /* Equals sign: a value follows. */
        $this->state = 'beforeAttributeValue';
        return;
    }

    if($c === '>') {
        /* End of tag: emit the current tag token. */
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($c === '/' && $this->character($this->char + 1) !== '>') {
        /* Solidus not directly followed by '>': parse error unless it is a
        permitted slash. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit the current tag token and reconsume the
        EOF in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: append the lowercased character to the name of the
    last attribute and stay in this state. */
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['name'] .= strtolower($c);

    $this->state = 'attributeName';
}
/**
 * After attribute name state: whitespace followed an attribute name; the
 * next significant character decides whether a value, the end of the tag,
 * or another attribute follows.
 */
private function afterAttributeNameState() {
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        /* Tab, LF, VT, FF or space: keep skipping. */
        $this->state = 'afterAttributeName';
        return;
    }

    if($c === '=') {
        /* Equals sign: a value follows. */
        $this->state = 'beforeAttributeValue';
        return;
    }

    if($c === '>') {
        /* End of tag: emit the current tag token. */
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($c === '/' && $this->character($this->char + 1) !== '>') {
        /* Solidus not directly followed by '>': parse error unless it is a
        permitted slash. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit the current tag token and reconsume the
        EOF in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: start a new attribute whose name is the lowercased
    character and whose value is not yet set. */
    $this->token['attr'][] = array(
        'name' => strtolower($c),
        'value' => null
    );

    $this->state = 'attributeName';
}
/**
 * Before attribute value state: entered after the '=' of an attribute.
 *
 * Skips whitespace, then picks the quoted or unquoted value state based on
 * the first significant character.
 *
 * Fix relative to the previous version: the end of input is now handled
 * explicitly (emit the tag token, reconsume EOF in the data state), like
 * the neighbouring attribute states do. Previously EOF fell into the
 * "anything else" branch and pushed the position past the end of the input.
 */
private function beforeAttributeValueState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        // Park the position just before EOF so the data state's increment
        // lands exactly on it.
        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Attribute value (double-quoted) state: collects characters into the
 * value of the last attribute until the closing '"' or the end of input.
 */
private function attributeValueDoubleQuotedState() {
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if($c === '"') {
        /* Closing quotation mark: the value is complete. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if($c === '&') {
        /* Ampersand: let the entity handler process it; the state itself
        is left unchanged. */
        $this->entityInAttributeValueState('double');
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit the current tag token and reconsume the
        EOF in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: append the character to the last attribute's value
    and stay in this state. */
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $c;

    $this->state = 'attributeValueDoubleQuoted';
}
/**
 * Attribute value (single-quoted) state: collects characters into the
 * value of the last attribute until the closing apostrophe or the end of
 * input.
 */
private function attributeValueSingleQuotedState() {
    // Consume the next input character.
    $this->char++;
    $c = $this->character($this->char);

    if($c === '\'') {
        /* Closing apostrophe: the value is complete. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if($c === '&') {
        /* Ampersand: let the entity handler process it; the state itself
        is left unchanged. */
        $this->entityInAttributeValueState('single');
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit the current tag token and reconsume the
        EOF in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: append the character to the last attribute's value
    and stay in this state. */
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $c;

    $this->state = 'attributeValueSingleQuoted';
}
/**
 * Attribute value (unquoted) state: collects characters into the value of
 * the last attribute until whitespace, '>' or the end of input.
 *
 * Fix relative to the previous version: the end of input is now handled.
 * Without it, an unterminated tag such as "<a href=foo" kept appending
 * nothing to the value and advancing the position forever.
 */
private function attributeValueUnquotedState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the character
        in the data state. */
        $this->emitToken($this->token);

        // ">=" and an absolute reset (rather than a decrement) also recover
        // when a preceding state already moved the position past EOF.
        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } elseif(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* Tab, LF, VT, FF or space:
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif($char === '&') {
        /* U+0026 AMPERSAND (&)
        Hand over to the entity in attribute value handler. */
        $this->entityInAttributeValueState('non');

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Handles an '&' encountered inside an attribute value.
 *
 * Tries to consume an entity and appends the decoded character (or a
 * literal '&' when nothing could be consumed) to the value of the attribute
 * currently being built. The tokenizer state is not changed, so the calling
 * attribute-value state continues afterwards.
 *
 * Callers pass a quoting hint ('double', 'single', 'non') as an argument;
 * this method declares no parameter and does not use it.
 *
 * Fixes relative to the previous version:
 *  - the character is appended to the attribute value, as the method's own
 *    comment described, instead of being passed to emitToken() as a bare
 *    string in the middle of a tag;
 *  - failure is detected with a strict comparison against false, so an
 *    entity that decodes to "0" is kept.
 */
private function entityInAttributeValueState() {
    // Attempt to consume an entity.
    $entity = $this->entity();

    // entity() returns false when nothing was consumed.
    $char = ($entity === false) ? '&' : $entity;

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
/**
 * Bogus comment state: turns malformed markup into a comment token.
 *
 * Everything from the current character up to, but not including, the next
 * '>' (or up to the end of input) becomes the comment's data. Afterwards
 * the tokenizer returns to the data state.
 *
 * Fix relative to the previous version: the comment text is measured with
 * strcspn() on the remaining input. The characters() helper used before
 * returns the whole remaining input when the very first character is
 * already '>' (as in "<!>"), which swallowed the rest of the document into
 * the comment.
 */
private function bogusCommentState() {
    // Casts guard against substr() returning false on older PHP versions.
    $rest = (string) substr($this->data, $this->char);
    $data = (string) substr($rest, 0, strcspn($rest, '>'));

    $this->emitToken(array(
        'data' => $data,
        'type' => self::COMMENT
    ));

    // Leaves the position on the '>' (or at EOF); the data state's
    // increment then steps over it.
    $this->char += strlen($data);

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if($this->char === $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
/**
 * Markup declaration open state: entered after "<!".
 *
 * Distinguishes a comment ("--"), a DOCTYPE (case-insensitive "doctype"),
 * and everything else, which is a parse error handled as a bogus comment.
 */
private function markupDeclarationOpenState() {
    $lookahead = $this->char + 1;

    if($this->character($lookahead, 2) === '--') {
        /* Two hyphens: consume them and begin a comment token with no
        data yet. */
        $this->char += 2;
        $this->state = 'comment';
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        return;
    }

    if(strtolower($this->character($lookahead, 7)) === 'doctype') {
        /* The word DOCTYPE in any letter case: consume it. */
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    /* Parse error. The next character consumed, if any, becomes the first
    character of the bogus comment. */
    $this->char++;
    $this->state = 'bogusComment';
}
/**
 * Comment state: appends characters to the comment token's data until a
 * hyphen or the end of input is seen.
 */
private function commentState() {
    /* Consume the next input character. */
    $this->char++;
    $c = $this->char();

    if($c === '-') {
        /* A hyphen may begin the closing "-->". */
        $this->state = 'commentDash';
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit the comment token and reconsume the EOF
        in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else belongs to the comment; the state is left unchanged. */
    $this->token['data'] .= $c;
}
/**
 * Comment dash state: one hyphen has been seen inside a comment; a second
 * one may start the comment's end.
 */
private function commentDashState() {
    /* Consume the next input character. */
    $this->char++;
    $c = $this->char();

    if($c === '-') {
        /* Second hyphen in a row. */
        $this->state = 'commentEnd';
        return;
    }

    if($this->char === $this->EOF) {
        /* EOF: parse error. Emit the comment token and reconsume the EOF
        in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else: the earlier hyphen and this character are both part
    of the comment text. */
    $this->token['data'] .= '-'.$c;
    $this->state = 'comment';
}
/**
 * Comment end state: "--" has been seen inside a comment; a '>' now closes
 * it.
 */
private function commentEndState() {
    /* Consume the next input character. */
    $this->char++;
    $c = $this->char();

    if($c === '>') {
        // "-->" completes the comment.
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($c === '-') {
        // A further hyphen: keep one as comment text and stay here.
        $this->token['data'] .= '-';
        return;
    }

    if($this->char === $this->EOF) {
        // End of input: emit what was collected and reconsume the EOF in
        // the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: the two hyphens were ordinary comment text.
    $this->token['data'] .= '--'.$c;
    $this->state = 'comment';
}
/**
 * DOCTYPE state: entered right after the word "DOCTYPE".
 *
 * A single whitespace character is consumed if present; any other
 * character is put back. Either way the before DOCTYPE name state follows.
 */
private function doctypeState() {
    /* Consume the next input character. */
    $this->char++;
    $c = $this->char();

    if(!preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        // Not whitespace: reconsume it in the next state.
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
/**
 * Before DOCTYPE name state: skips whitespace and starts a DOCTYPE token
 * on the first name character. Every token created here is flagged as
 * erroneous.
 */
private function beforeDoctypeNameState() {
    /* Consume the next input character. */
    $this->char++;
    $c = $this->char();

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        // Whitespace: stay in this state.
        return;
    }

    if(preg_match('/^[a-z]$/', $c)) {
        // Lowercase ASCII letter: the name is stored in upper case.
        $this->token = array(
            'name' => strtoupper($c),
            'type' => self::DOCTYPE,
            'error' => true
        );

        $this->state = 'doctypeName';
        return;
    }

    if($c === '>') {
        // "<!DOCTYPE>" without a name: emit a nameless DOCTYPE token.
        $this->emitToken(array(
            'name' => null,
            'type' => self::DOCTYPE,
            'error' => true
        ));

        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        // End of input: emit a nameless DOCTYPE token and reconsume the
        // EOF in the data state.
        $this->emitToken(array(
            'name' => null,
            'type' => self::DOCTYPE,
            'error' => true
        ));

        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts the name unchanged.
    $this->token = array(
        'name' => $c,
        'type' => self::DOCTYPE,
        'error' => true
    );

    $this->state = 'doctypeName';
}
/**
 * DOCTYPE name state: extends the DOCTYPE token's name.
 *
 * After every character, whichever branch was taken, the token's error
 * flag is recomputed: it is false only while the name is exactly "HTML".
 */
private function doctypeNameState() {
    /* Consume the next input character. */
    $this->char++;
    $c = $this->char();

    switch(true) {
        case preg_match('/^[\t\n\x0b\x0c ]$/', $c) === 1:
            // Whitespace ends the name.
            $this->state = 'AfterDoctypeName';
            break;

        case $c === '>':
            // End of the DOCTYPE: emit the token.
            $this->emitToken($this->token);
            $this->state = 'data';
            break;

        case preg_match('/^[a-z]$/', $c) === 1:
            // Lowercase ASCII letters are stored in upper case.
            $this->token['name'] .= strtoupper($c);
            break;

        case $this->char === $this->EOF:
            // End of input: emit the token and reconsume the EOF in the
            // data state.
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            break;

        default:
            // Any other character is appended unchanged.
            $this->token['name'] .= $c;
            break;
    }

    if($this->token['name'] === 'HTML') {
        $this->token['error'] = false;
    } else {
        $this->token['error'] = true;
    }
}
/**
 * After DOCTYPE name state: only whitespace or '>' may follow the name;
 * anything else marks the token as erroneous and is skipped as a bogus
 * DOCTYPE.
 */
private function afterDoctypeNameState() {
    /* Consume the next input character. */
    $this->char++;
    $c = $this->char();

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $c)) {
        // Whitespace: stay in this state.
        return;
    }

    if($c === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        // End of input: emit the token and reconsume the EOF in the data
        // state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
/**
 * Bogus DOCTYPE state: discards characters until '>' or the end of input,
 * then emits the DOCTYPE token collected so far.
 */
private function bogusDoctypeState() {
    /* Consume the next input character. */
    $this->char++;
    $c = $this->char();

    $at_eof = ($this->char === $this->EOF);

    if($c !== '>' && !$at_eof) {
        // Any other character is ignored; stay in this state.
        return;
    }

    $this->emitToken($this->token);

    if($c !== '>') {
        // End of input: reconsume the EOF in the data state.
        $this->char--;
    }

    $this->state = 'data';
}
/**
 * Attempts to consume a character reference.
 *
 * Must be called with the current position on the '&'. Used for entities
 * both in text and in attribute values.
 *
 * On success the position is left on the last character that belongs to
 * the reference (including a terminating ';' if present) and the decoded
 * character is returned. On failure the position is restored to the '&'
 * and false is returned.
 *
 * Fixes relative to the previous version:
 *  - hexadecimal detection looked at the '#' itself instead of the
 *    character after it, so "&#x..;" was never recognised;
 *  - the numeric entity text was cut out of the input using an absolute
 *    index as the length, and the digits were never consumed;
 *  - named entities stopped at the shortest table match, so "&amp;" was
 *    recognised as "amp" and left a stray ';' in the output; the longest
 *    match is now taken;
 *  - a name that already ended in ';' was given a second ';' before
 *    decoding.
 *
 * @return string|false Decoded character(s), or false when no entity could
 *                      be consumed.
 */
private function entity() {
    $start = $this->char;
    $entity = null;

    if($this->character($start + 1) === '#') {
        // Numeric reference. An 'x' or 'X' after the '#' selects
        // hexadecimal digits, anything else decimal digits.
        $marker = $this->character($start + 2);
        $hex = ($marker === 'x' || $marker === 'X');

        $digits_at = $start + 2 + ($hex ? 1 : 0);
        $rest = (string) substr($this->data, $digits_at);
        $count = strspn($rest, $hex ? '0123456789ABCDEFabcdef' : '0123456789');

        if($count > 0) {
            $entity = '#'.($hex ? 'x' : '').substr($rest, 0, $count);

            // Move to the last digit, then take an optional ';' as well.
            $this->char = $digits_at + $count - 1;
            if($this->character($this->char + 1) === ';') {
                $this->char++;
            }
        }

    } else {
        // Named reference. Candidate text is the run of ASCII letters and
        // digits after the '&', plus one ';' if it directly follows (table
        // names contain ';' only as their final character).
        $rest = (string) substr($this->data, $start + 1);
        $len = strspn($rest, '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz');

        if(substr($rest, $len, 1) === ';') {
            $len++;
        }

        // Longest candidate first, so "amp;" wins over "amp" and "notin;"
        // over "not". Matching is case-sensitive.
        for($c = $len; $c >= 1; $c--) {
            $id = substr($rest, 0, $c);

            if(in_array($id, $this->entities, true)) {
                $entity = rtrim($id, ';');
                $this->char = $start + $c;
                break;
            }
        }
    }

    if($entity === null) {
        // No match: parse error. Nothing is consumed and nothing returned.
        $this->char = $start;
        return false;
    }

    // Let PHP translate the reference into the character it stands for.
    return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
}
/* Hands a token to the tree constructor and updates the tokeniser's
   content model flag from the outcome: an integer result becomes the new
   content model; otherwise an end tag token resets it to PCDATA. */
private function emitToken($token) {
    $result = $this->tree->emitToken($token);

    if(is_int($result)) {
        $this->content_model = $result;
        return;
    }

    if($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
/* Clears the tokeniser state and passes an end-of-file token straight to
   the tree constructor. */
private function EOF() {
    $this->state = null;

    $eof_token = array(
        'type' => self::EOF
    );

    $this->tree->emitToken($eof_token);
}
1110 class HTML5TreeConstructer {
1111 public $stack = array();
1113 private $phase;
1114 private $mode;
1115 private $dom;
1116 private $foster_parent = null;
1117 private $a_formatting = array();
1119 private $head_pointer = null;
1120 private $form_pointer = null;
1122 private $scoping = array('button','caption','html','marquee','object','table','td','th');
1123 private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
1124 private $special = array('address','area','base','basefont','bgsound',
1125 'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
1126 'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
1127 'h6','head','hr','iframe','image','img','input','isindex','li','link',
1128 'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
1129 'option','p','param','plaintext','pre','script','select','spacer','style',
1130 'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
1132 // The different phases.
1133 const INIT_PHASE = 0;
1134 const ROOT_PHASE = 1;
1135 const MAIN_PHASE = 2;
1136 const END_PHASE = 3;
1138 // The different insertion modes for the main phase.
1139 const BEFOR_HEAD = 0;
1140 const IN_HEAD = 1;
1141 const AFTER_HEAD = 2;
1142 const IN_BODY = 3;
1143 const IN_TABLE = 4;
1144 const IN_CAPTION = 5;
1145 const IN_CGROUP = 6;
1146 const IN_TBODY = 7;
1147 const IN_ROW = 8;
1148 const IN_CELL = 9;
1149 const IN_SELECT = 10;
1150 const AFTER_BODY = 11;
1151 const IN_FRAME = 12;
1152 const AFTR_FRAME = 13;
1154 // The different types of elements.
1155 const SPECIAL = 0;
1156 const SCOPING = 1;
1157 const FORMATTING = 2;
1158 const PHRASING = 3;
1160 const MARKER = 0;
/* Sets the constructor to its starting phase and insertion mode and
   creates the DOMDocument that the tree is built into. */
public function __construct() {
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;

    // Configure the document through a local handle before storing it.
    $document = new DOMDocument;
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
}
/* Entry point for tokens coming from the tokeniser: forwards the token to
   the handler for the current phase and returns whatever that handler
   returns. Yields null when the phase matches none of the known phases. */
public function emitToken($token) {
    switch($this->phase) {
        case self::INIT_PHASE:
            return $this->initPhase($token);

        case self::ROOT_PHASE:
            return $this->rootElementPhase($token);

        case self::MAIN_PHASE:
            return $this->mainPhase($token);

        case self::END_PHASE:
            return $this->trailingEndPhase($token);

        default:
            return null;
    }
}
/* Initial phase of tree construction. Tokens that the specification
   leaves undefined here are passed on to the root element phase; a correct
   DOCTYPE merely advances the phase; whitespace is appended to the
   document. */
private function initPhase($token) {
    $whitespace = '/^[\t\n\x0b\x0c ]+$/';

    /* A DOCTYPE token that is marked as being in error, a comment token,
    a start tag token, an end tag token, an end-of-file token, or a
    character token that is not whitespace only. */
    $hand_over =
        (isset($token['error']) && $token['error']) ||
        in_array($token['type'], array(
            HTML5::COMMENT,
            HTML5::STARTTAG,
            HTML5::ENDTAG,
            HTML5::EOF
        ), true) ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
            !preg_match($whitespace, $token['data']));

    if($hand_over) {
        /* This specification does not define how to handle this case.
        Move on to the root element phase and let it deal with the token. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct */
    if(isset($token['error']) && !$token['error']) {
        // NOTE(review): the node created here is never appended to
        // $this->dom in this method, although the specification text asks
        // for a DocumentType node on the Document - confirm the intent.
        $doctype = new DOMDocumentType(null, null, 'HTML');

        /* Then, switch to the root element phase of the tree construction
        stage. */
        $this->phase = self::ROOT_PHASE;
        return;
    }

    /* A token carrying whitespace-only data: append it to the Document
    node as a text node. */
    if(isset($token['data']) && preg_match($whitespace, $token['data'])) {
        $this->dom->appendChild($this->dom->createTextNode($token['data']));
    }
}
/* Root element phase: comments and whitespace are appended directly to
   the document, DOCTYPE tokens are dropped, and the first token of any
   other handled kind creates the html root element before being
   reprocessed in the main phase. */
private function rootElementPhase($token) {
    $type = $token['type'];

    /* A DOCTYPE token: parse error, ignore the token. */
    if($type === HTML5::DOCTYPE) {
        return;
    }

    /* A comment token: append a Comment node to the Document object. */
    if($type === HTML5::COMMENT) {
        $this->dom->appendChild($this->dom->createComment($token['data']));
        return;
    }

    if($type === HTML5::CHARACTR) {
        /* Whitespace-only character data is appended to the Document
        node; any other character data falls through to the root element
        creation below. */
        if(preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            $this->dom->appendChild($this->dom->createTextNode($token['data']));
            return;
        }

    } elseif($type !== HTML5::STARTTAG && $type !== HTML5::ENDTAG &&
    $type !== HTML5::EOF) {
        // Token kinds not listed for this phase are left unhandled.
        return;
    }

    /* Create an HTMLElement node with the tag name html, append it to the
    Document object and open it on the stack. Then switch to the main
    phase and reprocess the current token. */
    $root = $this->dom->createElement('html');
    $this->dom->appendChild($root);
    $this->stack[] = $root;

    $this->phase = self::MAIN_PHASE;
    return $this->mainPhase($token);
}
/* Main phase: handles the few tokens that are treated the same in every
   insertion mode, then dispatches everything else to the handler for the
   current insertion mode and returns that handler's result. */
private function mainPhase($token) {
    $type = $token['type'];

    /* A DOCTYPE token: parse error, ignore the token. */
    if($type === HTML5::DOCTYPE) {
        return;
    }

    /* A start tag token with the tag name "html" */
    if($type === HTML5::STARTTAG && $token['name'] === 'html') {
        /* Copy onto the top element of the stack of open elements every
        attribute of the token that the element does not carry yet. */
        $root = $this->stack[0];

        foreach($token['attr'] as $attr) {
            if(!$root->hasAttribute($attr['name'])) {
                $root->setAttribute($attr['name'], $attr['value']);
            }
        }

        return;
    }

    /* An end-of-file token: generate implied end tags. */
    if($type === HTML5::EOF) {
        $this->generateImpliedEndTags();
        return;
    }

    /* Anything else depends on the insertion mode. */
    switch($this->mode) {
        case self::BEFOR_HEAD: return $this->beforeHead($token);
        case self::IN_HEAD:    return $this->inHead($token);
        case self::AFTER_HEAD: return $this->afterHead($token);
        case self::IN_BODY:    return $this->inBody($token);
        case self::IN_TABLE:   return $this->inTable($token);
        case self::IN_CAPTION: return $this->inCaption($token);
        case self::IN_CGROUP:  return $this->inColumnGroup($token);
        case self::IN_TBODY:   return $this->inTableBody($token);
        case self::IN_ROW:     return $this->inRow($token);
        case self::IN_CELL:    return $this->inCell($token);
        case self::IN_SELECT:  return $this->inSelect($token);
        case self::AFTER_BODY: return $this->afterBody($token);
        case self::IN_FRAME:   return $this->inFrameset($token);
        case self::AFTR_FRAME: return $this->afterFrameset($token);

        // NOTE(review): END_PHASE has the same value (3) as IN_BODY, so
        // the IN_BODY case above always wins and this one is never taken.
        case self::END_PHASE:  return $this->trailingEndPhase($token);
    }
}
/* "Before head" insertion mode: whitespace and comments are inserted as
   they come, an explicit head start tag opens the head, and most other
   tokens imply a head element and are then reprocessed "in head". */
private function beforeHead($token) {
    $type = $token['type'];

    /* A character token consisting only of U+0009, U+000A, U+000B, U+000C
    or U+0020: append the character to the current node. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node. */
    if($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    /* A start tag token with the tag name "head" */
    if($type === HTML5::STARTTAG && $token['name'] === 'head') {
        /* Insert the element, remember it as the head element pointer and
        change the insertion mode to "in head". */
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;
        return;
    }

    /* Any other start tag token, an end tag with the tag name "html", or
    a character token that is not a single whitespace character. */
    $implies_head =
        $type === HTML5::STARTTAG ||
        ($type === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($type === HTML5::CHARACTR &&
            !preg_match('/^[\t\n\x0b\x0c ]$/', $token['data']));

    if($implies_head) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $this->beforeHead(array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inHead($token);
    }

    /* Any other end tag: parse error, ignore the token. */
}
/* "In head" insertion mode.

   Returns one of the HTML5 content model constants when the token changes
   the tokeniser's content model (title, style and script start tags, and
   their end tags), the result of afterHead() when the token is reprocessed
   there, and nothing otherwise.

   The head element pointer may be null (innerHTML case); every use of it
   in this method is guarded accordingly. */
private function inHead($token) {
    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if(($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
    $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
    array('title', 'style', 'script'), true))) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag for title, style or script: close the element and put
    the tokeniser back into PCDATA. */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('title', 'style', 'script'), true)) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag with the tag name "title" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $element = $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the RCDATA state. */
        return HTML5::RCDATA;

    /* A start tag with the tag name "style" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "script" */
    } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
        /* Create an element for the token. As for title and style, fall
        back to the current node when there is no head element pointer
        instead of dereferencing null. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('base', 'link', 'meta'), true)) {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);

        } else {
            // NOTE(review): unlike the branch above, nothing is popped
            // from the stack here - confirm against insertElement().
            $this->insertElement($token);
        }

    /* An end tag with the tag name "head" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is a head element, pop the current node off
        the stack of open elements. Otherwise (including when there is no
        head element pointer at all) this is a parse error. */
        if($this->head_pointer !== null &&
        $this->head_pointer->isSameNode(end($this->stack))) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is a head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if($this->head_pointer !== null &&
        $this->head_pointer->isSameNode(end($this->stack))) {
            $this->inHead(array(
                'name' => 'head',
                'type' => HTML5::ENDTAG
            ));

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
/* "After head" insertion mode: whitespace and comments are inserted,
   body and frameset start tags open their element and switch mode,
   head-only start tags are sent back to "in head", and anything else
   implies a body element and is reprocessed "in body". */
private function afterHead($token) {
    $type = $token['type'];

    /* A character token consisting only of U+0009, U+000A, U+000B, U+000C
    or U+0020: append the character to the current node. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node. */
    if($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    if($type === HTML5::STARTTAG) {
        /* A start tag token with the tag name "body": insert a body
        element and change the insertion mode to "in body". */
        if($token['name'] === 'body') {
            $this->insertElement($token);
            $this->mode = self::IN_BODY;
            return;
        }

        /* A start tag token with the tag name "frameset": insert a
        frameset element and change the insertion mode to "in frameset". */
        if($token['name'] === 'frameset') {
            $this->insertElement($token);
            $this->mode = self::IN_FRAME;
            return;
        }

        /* A start tag token whose tag name is one of: "base", "link",
        "meta", "script", "style", "title". Parse error. Switch the
        insertion mode back to "in head" and reprocess the token. */
        $head_only = array('base', 'link', 'meta', 'script', 'style', 'title');

        if(in_array($token['name'], $head_only)) {
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else: act as if a start tag token with the tag name "body"
    and no attributes had been seen, and then reprocess the current
    token. */
    $this->afterHead(array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    ));

    return $this->inBody($token);
}
1564 private function inBody($token) {
1565 /* Handle the token as follows: */
1567 switch($token['type']) {
1568 /* A character token */
1569 case HTML5::CHARACTR:
1570 /* Reconstruct the active formatting elements, if any. */
1571 $this->reconstructActiveFormattingElements();
1573 /* Append the token's character to the current node. */
1574 $this->insertText($token['data']);
1575 break;
1577 /* A comment token */
1578 case HTML5::COMMENT:
1579 /* Append a Comment node to the current node with the data
1580 attribute set to the data given in the comment token. */
1581 $this->insertComment($token['data']);
1582 break;
1584 case HTML5::STARTTAG:
1585 switch($token['name']) {
1586 /* A start tag token whose tag name is one of: "script",
1587 "style" */
1588 case 'script': case 'style':
1589 /* Process the token as if the insertion mode had been "in
1590 head". */
1591 return $this->inHead($token);
1592 break;
1594 /* A start tag token whose tag name is one of: "base", "link",
1595 "meta", "title" */
1596 case 'base': case 'link': case 'meta': case 'title':
1597 /* Parse error. Process the token as if the insertion mode
1598 had been "in head". */
1599 return $this->inHead($token);
1600 break;
1602 /* A start tag token with the tag name "body" */
1603 case 'body':
1604 /* Parse error. If the second element on the stack of open
1605 elements is not a body element, or, if the stack of open
1606 elements has only one node on it, then ignore the token.
1607 (innerHTML case) */
1608 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1609 // Ignore
1611 /* Otherwise, for each attribute on the token, check to see
1612 if the attribute is already present on the body element (the
1613 second element) on the stack of open elements. If it is not,
1614 add the attribute and its corresponding value to that
1615 element. */
1616 } else {
1617 foreach($token['attr'] as $attr) {
1618 if(!$this->stack[1]->hasAttribute($attr['name'])) {
1619 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1623 break;
1625 /* A start tag whose tag name is one of: "address",
1626 "blockquote", "center", "dir", "div", "dl", "fieldset",
1627 "listing", "menu", "ol", "p", "ul" */
1628 case 'address': case 'blockquote': case 'center': case 'dir':
1629 case 'div': case 'dl': case 'fieldset': case 'listing':
1630 case 'menu': case 'ol': case 'p': case 'ul':
1631 /* If the stack of open elements has a p element in scope,
1632 then act as if an end tag with the tag name p had been
1633 seen. */
1634 if($this->elementInScope('p')) {
1635 $this->emitToken(array(
1636 'name' => 'p',
1637 'type' => HTML5::ENDTAG
1641 /* Insert an HTML element for the token. */
1642 $this->insertElement($token);
1643 break;
1645 /* A start tag whose tag name is "form" */
1646 case 'form':
1647 /* If the form element pointer is not null, ignore the
1648 token with a parse error. */
1649 if($this->form_pointer !== null) {
1650 // Ignore.
1652 /* Otherwise: */
1653 } else {
1654 /* If the stack of open elements has a p element in
1655 scope, then act as if an end tag with the tag name p
1656 had been seen. */
1657 if($this->elementInScope('p')) {
1658 $this->emitToken(array(
1659 'name' => 'p',
1660 'type' => HTML5::ENDTAG
1664 /* Insert an HTML element for the token, and set the
1665 form element pointer to point to the element created. */
1666 $element = $this->insertElement($token);
1667 $this->form_pointer = $element;
1669 break;
1671 /* A start tag whose tag name is "li", "dd" or "dt" */
1672 case 'li': case 'dd': case 'dt':
1673 /* If the stack of open elements has a p element in scope,
1674 then act as if an end tag with the tag name p had been
1675 seen. */
1676 if($this->elementInScope('p')) {
1677 $this->emitToken(array(
1678 'name' => 'p',
1679 'type' => HTML5::ENDTAG
1683 $stack_length = count($this->stack) - 1;
1685 for($n = $stack_length; 0 <= $n; $n--) {
1686 /* 1. Initialise node to be the current node (the
1687 bottommost node of the stack). */
1688 $stop = false;
1689 $node = $this->stack[$n];
1690 $cat = $this->getElementCategory($node->tagName);
1692 /* 2. If node is an li, dd or dt element, then pop all
1693 the nodes from the current node up to node, including
1694 node, then stop this algorithm. */
1695 if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1696 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1697 for($x = $stack_length; $x >= $n ; $x--) {
1698 array_pop($this->stack);
1701 break;
1704 /* 3. If node is not in the formatting category, and is
1705 not in the phrasing category, and is not an address or
1706 div element, then stop this algorithm. */
1707 if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1708 $node->tagName !== 'address' && $node->tagName !== 'div') {
1709 break;
1713 /* Finally, insert an HTML element with the same tag
1714 name as the token's. */
1715 $this->insertElement($token);
1716 break;
1718 /* A start tag token whose tag name is "plaintext" */
1719 case 'plaintext':
1720 /* If the stack of open elements has a p element in scope,
1721 then act as if an end tag with the tag name p had been
1722 seen. */
1723 if($this->elementInScope('p')) {
1724 $this->emitToken(array(
1725 'name' => 'p',
1726 'type' => HTML5::ENDTAG
1730 /* Insert an HTML element for the token. */
1731 $this->insertElement($token);
1733 return HTML5::PLAINTEXT;
1734 break;
1736 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1737 "h5", "h6" */
1738 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1739 /* If the stack of open elements has a p element in scope,
1740 then act as if an end tag with the tag name p had been seen. */
1741 if($this->elementInScope('p')) {
1742 $this->emitToken(array(
1743 'name' => 'p',
1744 'type' => HTML5::ENDTAG
1748 /* If the stack of open elements has in scope an element whose
1749 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1750 this is a parse error; pop elements from the stack until an
1751 element with one of those tag names has been popped from the
1752 stack. */
1753 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1754 array_pop($this->stack);
1757 /* Insert an HTML element for the token. */
1758 $this->insertElement($token);
1759 break;
1761 /* A start tag whose tag name is "a" */
1762 case 'a':
1763 /* If the list of active formatting elements contains
1764 an element whose tag name is "a" between the end of the
1765 list and the last marker on the list (or the start of
1766 the list if there is no marker on the list), then this
1767 is a parse error; act as if an end tag with the tag name
1768 "a" had been seen, then remove that element from the list
1769 of active formatting elements and the stack of open
1770 elements if the end tag didn't already remove it (it
1771 might not have if the element is not in table scope). */
1772 $leng = count($this->a_formatting);
1774 for($n = $leng - 1; $n >= 0; $n--) {
1775 if($this->a_formatting[$n] === self::MARKER) {
1776 break;
1778 } elseif($this->a_formatting[$n]->nodeName === 'a') {
1779 $this->emitToken(array(
1780 'name' => 'a',
1781 'type' => HTML5::ENDTAG
1783 break;
1787 /* Reconstruct the active formatting elements, if any. */
1788 $this->reconstructActiveFormattingElements();
1790 /* Insert an HTML element for the token. */
1791 $el = $this->insertElement($token);
1793 /* Add that element to the list of active formatting
1794 elements. */
1795 $this->a_formatting[] = $el;
1796 break;
1798 /* A start tag whose tag name is one of: "b", "big", "em", "font",
1799 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1800 case 'b': case 'big': case 'em': case 'font': case 'i':
1801 case 'nobr': case 's': case 'small': case 'strike':
1802 case 'strong': case 'tt': case 'u':
1803 /* Reconstruct the active formatting elements, if any. */
1804 $this->reconstructActiveFormattingElements();
1806 /* Insert an HTML element for the token. */
1807 $el = $this->insertElement($token);
1809 /* Add that element to the list of active formatting
1810 elements. */
1811 $this->a_formatting[] = $el;
1812 break;
1814 /* A start tag token whose tag name is "button" */
1815 case 'button':
1816 /* If the stack of open elements has a button element in scope,
1817 then this is a parse error; act as if an end tag with the tag
1818 name "button" had been seen, then reprocess the token. (We don't
1819 do that. Unnecessary.) */
1820 if($this->elementInScope('button')) {
1821 $this->inBody(array(
1822 'name' => 'button',
1823 'type' => HTML5::ENDTAG
1827 /* Reconstruct the active formatting elements, if any. */
1828 $this->reconstructActiveFormattingElements();
1830 /* Insert an HTML element for the token. */
1831 $this->insertElement($token);
1833 /* Insert a marker at the end of the list of active
1834 formatting elements. */
1835 $this->a_formatting[] = self::MARKER;
1836 break;
1838 /* A start tag token whose tag name is one of: "marquee", "object" */
1839 case 'marquee': case 'object':
1840 /* Reconstruct the active formatting elements, if any. */
1841 $this->reconstructActiveFormattingElements();
1843 /* Insert an HTML element for the token. */
1844 $this->insertElement($token);
1846 /* Insert a marker at the end of the list of active
1847 formatting elements. */
1848 $this->a_formatting[] = self::MARKER;
1849 break;
1851 /* A start tag token whose tag name is "xmp" */
1852 case 'xmp':
1853 /* Reconstruct the active formatting elements, if any. */
1854 $this->reconstructActiveFormattingElements();
1856 /* Insert an HTML element for the token. */
1857 $this->insertElement($token);
1859 /* Switch the content model flag to the CDATA state. */
1860 return HTML5::CDATA;
1861 break;
1863 /* A start tag whose tag name is "table" */
1864 case 'table':
1865 /* If the stack of open elements has a p element in scope,
1866 then act as if an end tag with the tag name p had been seen. */
1867 if($this->elementInScope('p')) {
1868 $this->emitToken(array(
1869 'name' => 'p',
1870 'type' => HTML5::ENDTAG
1874 /* Insert an HTML element for the token. */
1875 $this->insertElement($token);
1877 /* Change the insertion mode to "in table". */
1878 $this->mode = self::IN_TABLE;
1879 break;
1881 /* A start tag whose tag name is one of: "area", "basefont",
1882 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1883 case 'area': case 'basefont': case 'bgsound': case 'br':
1884 case 'embed': case 'img': case 'param': case 'spacer':
1885 case 'wbr':
1886 /* Reconstruct the active formatting elements, if any. */
1887 $this->reconstructActiveFormattingElements();
1889 /* Insert an HTML element for the token. */
1890 $this->insertElement($token);
1892 /* Immediately pop the current node off the stack of open elements. */
1893 array_pop($this->stack);
1894 break;
1896 /* A start tag whose tag name is "hr" */
1897 case 'hr':
1898 /* If the stack of open elements has a p element in scope,
1899 then act as if an end tag with the tag name p had been seen. */
1900 if($this->elementInScope('p')) {
1901 $this->emitToken(array(
1902 'name' => 'p',
1903 'type' => HTML5::ENDTAG
1907 /* Insert an HTML element for the token. */
1908 $this->insertElement($token);
1910 /* Immediately pop the current node off the stack of open elements. */
1911 array_pop($this->stack);
1912 break;
1914 /* A start tag whose tag name is "image" */
1915 case 'image':
1916 /* Parse error. Change the token's tag name to "img" and
1917 reprocess it. (Don't ask.) */
1918 $token['name'] = 'img';
1919 return $this->inBody($token);
1920 break;
1922 /* A start tag whose tag name is "input" */
1923 case 'input':
1924 /* Reconstruct the active formatting elements, if any. */
1925 $this->reconstructActiveFormattingElements();
1927 /* Insert an input element for the token. */
1928 $element = $this->insertElement($token, false);
1930 /* If the form element pointer is not null, then associate the
1931 input element with the form element pointed to by the form
1932 element pointer. */
1933 $this->form_pointer !== null
1934 ? $this->form_pointer->appendChild($element)
1935 : end($this->stack)->appendChild($element);
1937 /* Pop that input element off the stack of open elements. */
1938 array_pop($this->stack);
1939 break;
1941 /* A start tag whose tag name is "isindex" */
1942 case 'isindex':
1943 /* Parse error. */
1944 // w/e
1946 /* If the form element pointer is not null,
1947 then ignore the token. */
1948 if($this->form_pointer === null) {
1949 /* Act as if a start tag token with the tag name "form" had
1950 been seen. */
1951 $this->inBody(array(
1952 'name' => 'body',
1953 'type' => HTML5::STARTTAG,
1954 'attr' => array()
1957 /* Act as if a start tag token with the tag name "hr" had
1958 been seen. */
1959 $this->inBody(array(
1960 'name' => 'hr',
1961 'type' => HTML5::STARTTAG,
1962 'attr' => array()
1965 /* Act as if a start tag token with the tag name "p" had
1966 been seen. */
1967 $this->inBody(array(
1968 'name' => 'p',
1969 'type' => HTML5::STARTTAG,
1970 'attr' => array()
1973 /* Act as if a start tag token with the tag name "label"
1974 had been seen. */
1975 $this->inBody(array(
1976 'name' => 'label',
1977 'type' => HTML5::STARTTAG,
1978 'attr' => array()
1981 /* Act as if a stream of character tokens had been seen. */
1982 $this->insertText('This is a searchable index. '.
1983 'Insert your search keywords here: ');
1985 /* Act as if a start tag token with the tag name "input"
1986 had been seen, with all the attributes from the "isindex"
1987 token, except with the "name" attribute set to the value
1988 "isindex" (ignoring any explicit "name" attribute). */
1989 $attr = $token['attr'];
1990 $attr[] = array('name' => 'name', 'value' => 'isindex');
1992 $this->inBody(array(
1993 'name' => 'input',
1994 'type' => HTML5::STARTTAG,
1995 'attr' => $attr
1998 /* Act as if a stream of character tokens had been seen
1999 (see below for what they should say). */
2000 $this->insertText('This is a searchable index. '.
2001 'Insert your search keywords here: ');
2003 /* Act as if an end tag token with the tag name "label"
2004 had been seen. */
2005 $this->inBody(array(
2006 'name' => 'label',
2007 'type' => HTML5::ENDTAG
2010 /* Act as if an end tag token with the tag name "p" had
2011 been seen. */
2012 $this->inBody(array(
2013 'name' => 'p',
2014 'type' => HTML5::ENDTAG
2017 /* Act as if a start tag token with the tag name "hr" had
2018 been seen. */
2019 $this->inBody(array(
2020 'name' => 'hr',
2021 'type' => HTML5::ENDTAG
2024 /* Act as if an end tag token with the tag name "form" had
2025 been seen. */
2026 $this->inBody(array(
2027 'name' => 'form',
2028 'type' => HTML5::ENDTAG
2031 break;
2033 /* A start tag whose tag name is "textarea" */
2034 case 'textarea':
2035 $this->insertElement($token);
2037 /* Switch the tokeniser's content model flag to the
2038 RCDATA state. */
2039 return HTML5::RCDATA;
2040 break;
2042 /* A start tag whose tag name is one of: "iframe", "noembed",
2043 "noframes" */
2044 case 'iframe': case 'noembed': case 'noframes':
2045 $this->insertElement($token);
2047 /* Switch the tokeniser's content model flag to the CDATA state. */
2048 return HTML5::CDATA;
2049 break;
2051 /* A start tag whose tag name is "select" */
2052 case 'select':
2053 /* Reconstruct the active formatting elements, if any. */
2054 $this->reconstructActiveFormattingElements();
2056 /* Insert an HTML element for the token. */
2057 $this->insertElement($token);
2059 /* Change the insertion mode to "in select". */
2060 $this->mode = self::IN_SELECT;
2061 break;
2063 /* A start or end tag whose tag name is one of: "caption", "col",
2064 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2065 "tbody", "td", "tfoot", "th", "thead", "tr". */
2066 case 'caption': case 'col': case 'colgroup': case 'frame':
2067 case 'frameset': case 'head': case 'option': case 'optgroup':
2068 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2069 case 'tr':
2070 // Parse error. Ignore the token.
2071 break;
2073 /* A start or end tag whose tag name is one of: "event-source",
2074 "section", "nav", "article", "aside", "header", "footer",
2075 "datagrid", "command" */
2076 case 'event-source': case 'section': case 'nav': case 'article':
2077 case 'aside': case 'header': case 'footer': case 'datagrid':
2078 case 'command':
2079 // Work in progress!
2080 break;
2082 /* A start tag token not covered by the previous entries */
2083 default:
2084 /* Reconstruct the active formatting elements, if any. */
2085 $this->reconstructActiveFormattingElements();
2087 $this->insertElement($token);
2088 break;
2090 break;
2092 case HTML5::ENDTAG:
2093 switch($token['name']) {
2094 /* An end tag with the tag name "body" */
2095 case 'body':
2096 /* If the second element in the stack of open elements is
2097 not a body element, this is a parse error. Ignore the token.
2098 (innerHTML case) */
2099 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2100 // Ignore.
2102 /* If the current node is not the body element, then this
2103 is a parse error. */
2104 } elseif(end($this->stack)->nodeName !== 'body') {
2105 // Parse error.
2108 /* Change the insertion mode to "after body". */
2109 $this->mode = self::AFTER_BODY;
2110 break;
2112 /* An end tag with the tag name "html" */
2113 case 'html':
2114 /* Act as if an end tag with tag name "body" had been seen,
2115 then, if that token wasn't ignored, reprocess the current
2116 token. */
2117 $this->inBody(array(
2118 'name' => 'body',
2119 'type' => HTML5::ENDTAG
2122 return $this->afterBody($token);
2123 break;
2125 /* An end tag whose tag name is one of: "address", "blockquote",
2126 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2127 "ol", "pre", "ul" */
2128 case 'address': case 'blockquote': case 'center': case 'dir':
2129 case 'div': case 'dl': case 'fieldset': case 'listing':
2130 case 'menu': case 'ol': case 'pre': case 'ul':
2131 /* If the stack of open elements has an element in scope
2132 with the same tag name as that of the token, then generate
2133 implied end tags. */
2134 if($this->elementInScope($token['name'])) {
2135 $this->generateImpliedEndTags();
2137 /* Now, if the current node is not an element with
2138 the same tag name as that of the token, then this
2139 is a parse error. */
2140 // w/e
2142 /* If the stack of open elements has an element in
2143 scope with the same tag name as that of the token,
2144 then pop elements from this stack until an element
2145 with that tag name has been popped from the stack. */
2146 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2147 if($this->stack[$n]->nodeName === $token['name']) {
2148 $n = -1;
2151 array_pop($this->stack);
2154 break;
2156 /* An end tag whose tag name is "form" */
2157 case 'form':
2158 /* If the stack of open elements has an element in scope
2159 with the same tag name as that of the token, then generate
2160 implied end tags. */
2161 if($this->elementInScope($token['name'])) {
2162 $this->generateImpliedEndTags();
2166 if(end($this->stack)->nodeName !== $token['name']) {
2167 /* Now, if the current node is not an element with the
2168 same tag name as that of the token, then this is a parse
2169 error. */
2170 // w/e
2172 } else {
2173 /* Otherwise, if the current node is an element with
2174 the same tag name as that of the token pop that element
2175 from the stack. */
2176 array_pop($this->stack);
2179 /* In any case, set the form element pointer to null. */
2180 $this->form_pointer = null;
2181 break;
2183 /* An end tag whose tag name is "p" */
2184 case 'p':
2185 /* If the stack of open elements has a p element in scope,
2186 then generate implied end tags, except for p elements. */
2187 if($this->elementInScope('p')) {
2188 $this->generateImpliedEndTags(array('p'));
2190 /* If the current node is not a p element, then this is
2191 a parse error. */
2192 // k
2194 /* If the stack of open elements has a p element in
2195 scope, then pop elements from this stack until the stack
2196 no longer has a p element in scope. */
2197 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2198 if($this->elementInScope('p')) {
2199 array_pop($this->stack);
2201 } else {
2202 break;
2206 break;
2208 /* An end tag whose tag name is "dd", "dt", or "li" */
2209 case 'dd': case 'dt': case 'li':
2210 /* If the stack of open elements has an element in scope
2211 whose tag name matches the tag name of the token, then
2212 generate implied end tags, except for elements with the
2213 same tag name as the token. */
2214 if($this->elementInScope($token['name'])) {
2215 $this->generateImpliedEndTags(array($token['name']));
2217 /* If the current node is not an element with the same
2218 tag name as the token, then this is a parse error. */
2219 // w/e
2221 /* If the stack of open elements has an element in scope
2222 whose tag name matches the tag name of the token, then
2223 pop elements from this stack until an element with that
2224 tag name has been popped from the stack. */
2225 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2226 if($this->stack[$n]->nodeName === $token['name']) {
2227 $n = -1;
2230 array_pop($this->stack);
2233 break;
2235 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2236 "h5", "h6" */
2237 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2238 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2240 /* If the stack of open elements has in scope an element whose
2241 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2242 generate implied end tags. */
2243 if($this->elementInScope($elements)) {
2244 $this->generateImpliedEndTags();
2246 /* Now, if the current node is not an element with the same
2247 tag name as that of the token, then this is a parse error. */
2248 // w/e
2250 /* If the stack of open elements has in scope an element
2251 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2252 "h6", then pop elements from the stack until an element
2253 with one of those tag names has been popped from the stack. */
2254 while($this->elementInScope($elements)) {
2255 array_pop($this->stack);
2258 break;
2260 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2261 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2262 case 'a': case 'b': case 'big': case 'em': case 'font':
2263 case 'i': case 'nobr': case 's': case 'small': case 'strike':
2264 case 'strong': case 'tt': case 'u':
2265 /* 1. Let the formatting element be the last element in
2266 the list of active formatting elements that:
2267 * is between the end of the list and the last scope
2268 marker in the list, if any, or the start of the list
2269 otherwise, and
2270 * has the same tag name as the token.
2272 while(true) {
2273 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2274 if($this->a_formatting[$a] === self::MARKER) {
2275 break;
2277 } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2278 $formatting_element = $this->a_formatting[$a];
2279 $in_stack = in_array($formatting_element, $this->stack, true);
2280 $fe_af_pos = $a;
2281 break;
2285 /* If there is no such node, or, if that node is
2286 also in the stack of open elements but the element
2287 is not in scope, then this is a parse error. Abort
2288 these steps. The token is ignored. */
2289 if(!isset($formatting_element) || ($in_stack &&
2290 !$this->elementInScope($token['name']))) {
2291 break;
2293 /* Otherwise, if there is such a node, but that node
2294 is not in the stack of open elements, then this is a
2295 parse error; remove the element from the list, and
2296 abort these steps. */
2297 } elseif(isset($formatting_element) && !$in_stack) {
2298 unset($this->a_formatting[$fe_af_pos]);
2299 $this->a_formatting = array_merge($this->a_formatting);
2300 break;
2303 /* 2. Let the furthest block be the topmost node in the
2304 stack of open elements that is lower in the stack
2305 than the formatting element, and is not an element in
2306 the phrasing or formatting categories. There might
2307 not be one. */
2308 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2309 $length = count($this->stack);
2311 for($s = $fe_s_pos + 1; $s < $length; $s++) {
2312 $category = $this->getElementCategory($this->stack[$s]->nodeName);
2314 if($category !== self::PHRASING && $category !== self::FORMATTING) {
2315 $furthest_block = $this->stack[$s];
2319 /* 3. If there is no furthest block, then the UA must
2320 skip the subsequent steps and instead just pop all
2321 the nodes from the bottom of the stack of open
2322 elements, from the current node up to the formatting
2323 element, and remove the formatting element from the
2324 list of active formatting elements. */
2325 if(!isset($furthest_block)) {
2326 for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2327 array_pop($this->stack);
2330 unset($this->a_formatting[$fe_af_pos]);
2331 $this->a_formatting = array_merge($this->a_formatting);
2332 break;
2335 /* 4. Let the common ancestor be the element
2336 immediately above the formatting element in the stack
2337 of open elements. */
2338 $common_ancestor = $this->stack[$fe_s_pos - 1];
2340 /* 5. If the furthest block has a parent node, then
2341 remove the furthest block from its parent node. */
2342 if($furthest_block->parentNode !== null) {
2343 $furthest_block->parentNode->removeChild($furthest_block);
2346 /* 6. Let a bookmark note the position of the
2347 formatting element in the list of active formatting
2348 elements relative to the elements on either side
2349 of it in the list. */
2350 $bookmark = $fe_af_pos;
2352 /* 7. Let node and last node be the furthest block.
2353 Follow these steps: */
2354 $node = $furthest_block;
2355 $last_node = $furthest_block;
2357 while(true) {
2358 for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2359 /* 7.1 Let node be the element immediately
2360 prior to node in the stack of open elements. */
2361 $node = $this->stack[$n];
2363 /* 7.2 If node is not in the list of active
2364 formatting elements, then remove node from
2365 the stack of open elements and then go back
2366 to step 1. */
2367 if(!in_array($node, $this->a_formatting, true)) {
2368 unset($this->stack[$n]);
2369 $this->stack = array_merge($this->stack);
2371 } else {
2372 break;
2376 /* 7.3 Otherwise, if node is the formatting
2377 element, then go to the next step in the overall
2378 algorithm. */
2379 if($node === $formatting_element) {
2380 break;
2382 /* 7.4 Otherwise, if last node is the furthest
2383 block, then move the aforementioned bookmark to
2384 be immediately after the node in the list of
2385 active formatting elements. */
2386 } elseif($last_node === $furthest_block) {
2387 $bookmark = array_search($node, $this->a_formatting, true) + 1;
2390 /* 7.5 If node has any children, perform a
2391 shallow clone of node, replace the entry for
2392 node in the list of active formatting elements
2393 with an entry for the clone, replace the entry
2394 for node in the stack of open elements with an
2395 entry for the clone, and let node be the clone. */
2396 if($node->hasChildNodes()) {
2397 $clone = $node->cloneNode();
2398 $s_pos = array_search($node, $this->stack, true);
2399 $a_pos = array_search($node, $this->a_formatting, true);
2401 $this->stack[$s_pos] = $clone;
2402 $this->a_formatting[$a_pos] = $clone;
2403 $node = $clone;
2406 /* 7.6 Insert last node into node, first removing
2407 it from its previous parent node if any. */
2408 if($last_node->parentNode !== null) {
2409 $last_node->parentNode->removeChild($last_node);
2412 $node->appendChild($last_node);
2414 /* 7.7 Let last node be node. */
2415 $last_node = $node;
2418 /* 8. Insert whatever last node ended up being in
2419 the previous step into the common ancestor node,
2420 first removing it from its previous parent node if
2421 any. */
2422 if($last_node->parentNode !== null) {
2423 $last_node->parentNode->removeChild($last_node);
2426 $common_ancestor->appendChild($last_node);
2428 /* 9. Perform a shallow clone of the formatting
2429 element. */
2430 $clone = $formatting_element->cloneNode();
2432 /* 10. Take all of the child nodes of the furthest
2433 block and append them to the clone created in the
2434 last step. */
2435 while($furthest_block->hasChildNodes()) {
2436 $child = $furthest_block->firstChild;
2437 $furthest_block->removeChild($child);
2438 $clone->appendChild($child);
2441 /* 11. Append that clone to the furthest block. */
2442 $furthest_block->appendChild($clone);
2444 /* 12. Remove the formatting element from the list
2445 of active formatting elements, and insert the clone
2446 into the list of active formatting elements at the
2447 position of the aforementioned bookmark. */
2448 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2449 unset($this->a_formatting[$fe_af_pos]);
2450 $this->a_formatting = array_merge($this->a_formatting);
2452 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2453 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2454 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2456 /* 13. Remove the formatting element from the stack
2457 of open elements, and insert the clone into the stack
2458 of open elements immediately after (i.e. in a more
2459 deeply nested position than) the position of the
2460 furthest block in that stack. */
2461 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2462 $fb_s_pos = array_search($furthest_block, $this->stack, true);
2463 unset($this->stack[$fe_s_pos]);
2465 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2466 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2467 $this->stack = array_merge($s_part1, array($clone), $s_part2);
2469 /* 14. Jump back to step 1 in this series of steps. */
2470 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2472 break;
2474 /* An end tag token whose tag name is one of: "button",
2475 "marquee", "object" */
2476 case 'button': case 'marquee': case 'object':
2477 /* If the stack of open elements has an element in scope whose
2478 tag name matches the tag name of the token, then generate implied
2479 tags. */
2480 if($this->elementInScope($token['name'])) {
2481 $this->generateImpliedEndTags();
2483 /* Now, if the current node is not an element with the same
2484 tag name as the token, then this is a parse error. */
2485 // k
2487 /* Now, if the stack of open elements has an element in scope
2488 whose tag name matches the tag name of the token, then pop
2489 elements from the stack until that element has been popped from
2490 the stack, and clear the list of active formatting elements up
2491 to the last marker. */
2492 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2493 if($this->stack[$n]->nodeName === $token['name']) {
2494 $n = -1;
2497 array_pop($this->stack);
2500 $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2502 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2503 array_pop($this->a_formatting);
2506 break;
2508 /* Or an end tag whose tag name is one of: "area", "basefont",
2509 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2510 "input", "isindex", "noembed", "noframes", "param", "select",
2511 "spacer", "table", "textarea", "wbr" */
2512 case 'area': case 'basefont': case 'bgsound': case 'br':
2513 case 'embed': case 'hr': case 'iframe': case 'image':
2514 case 'img': case 'input': case 'isindex': case 'noembed':
2515 case 'noframes': case 'param': case 'select': case 'spacer':
2516 case 'table': case 'textarea': case 'wbr':
2517 // Parse error. Ignore the token.
2518 break;
2520 /* An end tag token not covered by the previous entries */
2521 default:
2522 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2523 /* Initialise node to be the current node (the bottommost
2524 node of the stack). */
2525 $node = end($this->stack);
2527 /* If node has the same tag name as the end tag token,
2528 then: */
2529 if($token['name'] === $node->nodeName) {
2530 /* Generate implied end tags. */
2531 $this->generateImpliedEndTags();
2533 /* If the tag name of the end tag token does not
2534 match the tag name of the current node, this is a
2535 parse error. */
2536 // k
2538 /* Pop all the nodes from the current node up to
2539 node, including node, then stop this algorithm. */
2540 for($x = count($this->stack) - $n; $x >= $n; $x--) {
2541 array_pop($this->stack);
2544 } else {
2545 $category = $this->getElementCategory($node);
2547 if($category !== self::SPECIAL && $category !== self::SCOPING) {
2548 /* Otherwise, if node is in neither the formatting
2549 category nor the phrasing category, then this is a
2550 parse error. Stop this algorithm. The end tag token
2551 is ignored. */
2552 return false;
2556 break;
2558 break;
/**
 * Processes one token while the insertion mode is "in table".
 *
 * Whitespace-only character tokens and comments are appended to the
 * current node. Table-structure start tags (caption, colgroup, col,
 * tbody/tfoot/thead, td/th/tr) open the matching element and switch
 * $this->mode. A "table" end tag pops the stack down to and including the
 * table element. Every other token is handed to inBody() after the foster
 * parent has been chosen.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start/end tags and 'data' for character and
 *                     comment tokens.
 * @return mixed false when a "table" start or end tag is ignored;
 *               otherwise the value returned by the handler the token
 *               was delegated to, or null when nothing was delegated.
 */
private function inTable($token) {
    $clear = array('html', 'table');
    $type = $token['type'];
    $is_start = ($type === HTML5::STARTTAG);
    $is_end = ($type === HTML5::ENDTAG);

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif($is_start && $token['name'] === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif($is_start && $token['name'] === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif($is_start && $token['name'] === 'col') {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then let the column group handler process the col. */
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($is_start &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif($is_start &&
    in_array($token['name'], array('td', 'th', 'tr'), true)) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif($is_start && $token['name'] === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $result = $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        ));

        /* The end tag branch below returns false exactly when it ignores
        the token. In that case the insertion mode has not changed, so
        reprocessing would presumably land in this same branch again;
        ignore the start tag as well. */
        if($result === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif($is_end && $token['name'] === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            return false;
        }

        /* Otherwise: generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a table element, then this
        is a parse error. */
        // w/e

        /* Pop elements from this stack until a table element has been
        popped from the stack. */
        do {
            $popped = array_pop($this->stack);
        } while($popped->nodeName !== 'table');

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($is_end && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
    'tfoot', 'th', 'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if(in_array(end($this->stack)->nodeName,
        array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* Locate the last table element in the stack of open
            elements, remembering its position. */
            $table = null;
            $table_pos = null;

            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    $table_pos = $n;
                    break;
                }
            }

            if($table === null) {
                /* No table element in the stack (innerHTML case): the
                foster parent is the first element in the stack (the html
                element). */
                $this->foster_parent = $this->stack[0];

            } elseif($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                /* The table has a parent element: that parent is the
                foster parent. The node type is tested here so that a
                non-element parent falls through to the branch below. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* The table has no parent, or its parent is not an
                element: the foster parent is the element before the
                table in the stack of open elements. */
                $this->foster_parent = $table_pos > 0
                    ? $this->stack[$table_pos - 1]
                    : $this->stack[0];
            }
        }

        /* Hand back inBody()'s result: it returns a content model constant
        for some start tags (e.g. textarea), and the other reprocessing
        paths in this method propagate their handler's return value too. */
        return $this->inBody($token);
    }
}
/**
 * Processes one token while the insertion mode is "in caption".
 *
 * A "caption" end tag closes the caption and switches back to "in table".
 * Table-structure start tags and a "table" end tag implicitly close the
 * caption and are then reprocessed by inTable(). A fixed set of stray end
 * tags is ignored. Everything else is processed as in "in body".
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start and end tags.
 * @return mixed false when an implicit caption close is ignored;
 *               otherwise the value returned by inTable()/inBody() when
 *               the token was delegated, or null.
 */
private function inCaption($token) {
    $type = $token['type'];

    /* An end tag whose tag name is "caption" */
    if($type === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            do {
                $popped = array_pop($this->stack);
            } while($popped->nodeName !== 'caption');

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($type === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) || ($type === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */

        /* The faked end tag is ignored exactly when no caption is in
        table scope (see the branch above); in that case this token is
        dropped too instead of being reprocessed. */
        if(!$this->elementInScope('caption', true)) {
            return false;
        }

        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($type === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        // Parse error. Ignore the token.
        // ('td' was missing from this list although the rule names it.)

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body".
        The result is handed back because inBody() returns a content model
        constant for some start tags (e.g. textarea). */
        return $this->inBody($token);
    }
}
/**
 * Processes one token while the insertion mode is "in column group".
 *
 * Whitespace-only character tokens and comments are appended to the
 * current node. A "col" start tag inserts an element and pops it again
 * straight away. A "colgroup" end tag pops the current node and switches
 * back to "in table". Any other token implicitly closes the column group
 * and is reprocessed by inTable().
 *
 * @param array $token Token array. 'type' is always read; 'name' is read
 *                     for start/end tags and 'data' for character and
 *                     comment tokens.
 * @return mixed false when the implicit colgroup close is ignored; the
 *               value returned by inTable() when the token was
 *               reprocessed there; null otherwise.
 */
private function inColumnGroup($token) {
    $type = $token['type'];

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "col" */
    } elseif($type === HTML5::STARTTAG && $token['name'] === 'col') {
        /* Insert a col element for the token. Immediately pop the current
        node off the stack of open elements. */
        $this->insertElement($token);
        array_pop($this->stack);

    /* An end tag whose tag name is "colgroup" */
    } elseif($type === HTML5::ENDTAG && $token['name'] === 'colgroup') {
        /* If the current node is the root html element, then this is a
        parse error, ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName === 'html') {
            // Ignore

        /* Otherwise, pop the current node (which will be a colgroup
        element) from the stack of open elements. Switch the insertion
        mode to "in table". */
        } else {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* An end tag whose tag name is "col" */
    } elseif($type === HTML5::ENDTAG && $token['name'] === 'col') {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Act as if an end tag with the tag name "colgroup" had been seen,
        and then, if that token wasn't ignored, reprocess the current
        token. */

        /* The faked end tag is ignored exactly when the current node is
        the root html element (see the branch above); in that case this
        token is dropped too instead of being reprocessed. */
        if(end($this->stack)->nodeName === 'html') {
            return false;
        }

        $this->inColumnGroup(array(
            'name' => 'colgroup',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);
    }
}
/**
 * Handles a token while the insertion mode is "in table body".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tag tokens.
 * @return mixed Whatever the handler that reprocessed the token returned,
 *               or null when the token was consumed or ignored here.
 */
private function inTableBody($token) {
    $clear = array('tbody', 'tfoot', 'thead', 'html');
    $type = $token['type'];

    /* A start tag whose tag name is "tr" */
    if($type === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($type === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($type === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table".
    (Previously the list said 'tfoor' and the second half tested for a
    start tag, so neither a tfoot start tag nor a table end tag matched.) */
    } elseif(($type === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
    ($type === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($type === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Handles a token while the insertion mode is "in row".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tag tokens.
 * @return mixed Whatever the handler that reprocessed the token returned,
 *               or null when the token was consumed or ignored here.
 */
private function inRow($token) {
    $clear = array('tr', 'html');
    $type = $token['type'];

    /* A start tag whose tag name is one of: "th", "td" */
    if($type === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($type === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif(($type === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
    ($type === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied end tag is ignored exactly when no tr is in table scope,
        so that is recorded before the stack is changed. */
        $rowOpen = $this->elementInScope('tr', true);

        $this->inRow(array(
            'name' => 'tr',
            'type' => HTML5::ENDTAG
        ));

        /* The implied </tr> switched the mode to "in table body", so the
        token is reprocessed there rather than by the "in cell" handler. */
        if($rowOpen) {
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($type === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. Reprocessing is
            skipped when the implied </tr> was itself ignored, because the
            mode would still be "in row" and nothing would have changed. */
            $rowOpen = $this->elementInScope('tr', true);

            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            if($rowOpen) {
                return $this->inTableBody($token);
            }
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($type === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Handles a token while the insertion mode is "in cell".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tag tokens.
 * @return mixed Result of inRow() when the cell was closed and the token
 *               reprocessed, otherwise null.
 */
private function inCell($token) {
    $type = $token['type'];

    /* An end tag whose tag name is one of: "td", "th" */
    if($type === HTML5::ENDTAG &&
    ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error. */
            // k

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr". (This branch used to be
    present twice; the second copy could never be reached.) */
    } elseif($type === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if(!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif($type === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'))) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif($type === HTML5::ENDTAG && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise, close the cell (see below) and reprocess the current
        token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Handles a token while the insertion mode is "in select".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inSelect($token) {
    $type = $token['type'];
    // Only tag tokens are known to carry a name; read it defensively so
    // other token kinds do not trigger an undefined-index notice.
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token */
    if($type === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($type === HTML5::STARTTAG && $name === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($type === HTML5::STARTTAG && $name === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($type === HTML5::ENDTAG && $name === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $depth = count($this->stack);

        if($depth >= 2 &&
        $this->stack[$depth - 1]->nodeName === 'option' &&
        $this->stack[$depth - 2]->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is looked up again here because
        the implied </option> above may have popped the stack, and it is
        the node's name (not the node object) that must be compared. */
        if(end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($type === HTML5::ENDTAG && $name === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($type === HTML5::ENDTAG && $name === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($name, true)) {
            // w/e

        /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while(true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif($type === HTML5::STARTTAG && $name === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(array(
            'name' => 'select',
            'type' => HTML5::ENDTAG
        ));

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($type === HTML5::ENDTAG && in_array($name, array('caption',
    'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
        /* Parse error. */
        // w/e

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if($this->elementInScope($name, true)) {
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles a token while the insertion mode is "after body".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for end tags.
 * @return mixed Result of inBody() when the token falls through to the
 *               "anything else" case, otherwise null.
 */
private function afterBody($token) {
    $kind = $token['type'];

    /* Whitespace-only character token (tab, LF, vertical tab, form feed
    or space): handled exactly as in the "in body" insertion mode. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* Comment token: the comment node is appended to the first element in
    the stack of open elements (the html element). */
    if($kind === HTML5::COMMENT) {
        $node = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($node);
        return;
    }

    /* End tag "html": switch to the trailing end phase. (The innerHTML
    case, in which the token would be ignored, is not distinguished.) */
    if($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Set the insertion mode to "in body" and
    reprocess the token. */
    $this->mode = self::IN_BODY;
    return $this->inBody($token);
}
/**
 * Handles a token while the insertion mode is "in frameset".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inFrameset($token) {
    $type = $token['type'];
    // Only tag tokens are known to carry a name; read it defensively so
    // other token kinds do not trigger an undefined-index notice.
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    U+000D CARRIAGE RETURN (CR), or U+0020 SPACE.
    NOTE(review): the pattern below does not include CR; presumably CR is
    normalised before tokens reach this point - confirm. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif($type === HTML5::STARTTAG && $name === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($type === HTML5::ENDTAG && $name === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, change
            the insertion mode to "after frameset". While an outer frameset
            is still open the mode must stay "in frameset", otherwise the
            remaining frames of the outer frameset would be ignored. */
            $current = end($this->stack);

            if(!$current || $current->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($type === HTML5::STARTTAG && $name === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($type === HTML5::STARTTAG && $name === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles a token while the insertion mode is "after frameset".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' otherwise.
 * @return void
 */
private function afterFrameset($token) {
    /* Whitespace-only character token: append it to the current node. */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* Comment token: append a Comment node, carrying the token's data, to
    the current node. */
    if($token['type'] === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    /* End tag "html": switch to the trailing end phase. */
    if($token['name'] === 'html' && $token['type'] === HTML5::ENDTAG) {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Start tag "noframes": process the token as if the insertion mode had
    been "in body". */
    if($token['name'] === 'noframes' && $token['type'] === HTML5::STARTTAG) {
        $this->inBody($token);
        return;
    }

    /* Anything else: parse error, the token is ignored. */
}
/**
 * Handles a token emitted by the tokeniser after the main phase ended.
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens.
 * @return mixed Result of mainPhase() when the parser switches back to the
 *               main phase to reprocess the token, otherwise null.
 */
private function trailingEndPhase($token) {
    $type = $token['type'];

    // True only for character tokens made up entirely of tab, LF, vertical
    // tab, form feed or space.
    $isWhitespace = $type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']);

    /* A DOCTYPE token */
    if($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif($isWhitespace) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE. Or a start tag token. Or an end tag token.
    (The character test used to lack its negation, so non-whitespace text
    matched no branch and was silently dropped.) */
    } elseif(($type === HTML5::CHARACTR && !$isWhitespace) ||
    $type === HTML5::STARTTAG || $type === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($type === HTML5::EOF) {
        /* OMG DONE!! */
    }
}
/**
 * Creates a DOM element for a tag token, attaches it to the tree and pushes
 * it onto the stack of open elements.
 *
 * @param array $token  Tag token; 'name' gives the element name and 'attr'
 *                      is iterated as a list of entries with 'name' and
 *                      'value' keys.
 * @param bool  $append Not used by this method body.
 * @return mixed The element returned by $this->dom->createElement().
 */
private function insertElement($token, $append = true) {
    $element = $this->dom->createElement($token['name']);

    foreach($token['attr'] as $attribute) {
        // When an attribute name occurs more than once, the first
        // occurrence wins and later ones are skipped.
        if($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
3483 private function insertText(