composer package updates
[openemr.git] / vendor / dompdf / dompdf / lib / html5lib / TreeBuilder.php
blobcc8cbc6aa3da63e63eb3e7a5924e3a394f3a079c
1 <?php
3 /*
5 Copyright 2007 Jeroen van der Meer <http://jero.net/>
6 Copyright 2009 Edward Z. Yang <edwardzyang@thewritingpot.com>
8 Permission is hereby granted, free of charge, to any person obtaining a
9 copy of this software and associated documentation files (the
10 "Software"), to deal in the Software without restriction, including
11 without limitation the rights to use, copy, modify, merge, publish,
12 distribute, sublicense, and/or sell copies of the Software, and to
13 permit persons to whom the Software is furnished to do so, subject to
14 the following conditions:
16 The above copyright notice and this permission notice shall be included
17 in all copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 // Tags for FIX ME!!!: (in order of priority)
30 // XXX - should be fixed NAO!
31 // XERROR - with regards to parse errors
32 // XSCRIPT - with regards to scripting mode
33 // XENCODING - with regards to encoding (for reparsing tests)
34 // XDOM - DOM specific code (tagName is explicitly not marked).
35 // this is not (yet) in helper functions.
37 class HTML5_TreeBuilder {
38 public $stack = array();
39 public $content_model;
41 private $mode;
42 private $original_mode;
43 private $secondary_mode;
44 private $dom;
45 // Whether or not normal insertion of nodes should actually foster
46 // parent (used in one case in spec)
47 private $foster_parent = false;
48 private $a_formatting = array();
50 private $head_pointer = null;
51 private $form_pointer = null;
53 private $flag_frameset_ok = true;
54 private $flag_force_quirks = false;
55 private $ignored = false;
56 private $quirks_mode = null;
57 // this gets to 2 when we want to ignore the next lf character, and
58 // is decremented at the beginning of each processed token (this way,
59 // code can check for (bool)$ignore_lf_token, but it phases out
60 // appropriately)
61 private $ignore_lf_token = 0;
62 private $fragment = false;
63 private $root;
65 private $scoping = array('applet','button','caption','html','marquee','object','table','td','th', 'svg:foreignObject');
66 private $formatting = array('a','b','big','code','em','font','i','nobr','s','small','strike','strong','tt','u');
67 // dl and ds are speculative
68 private $special = array('address','area','article','aside','base','basefont','bgsound',
69 'blockquote','body','br','center','col','colgroup','command','dc','dd','details','dir','div','dl','ds',
70 'dt','embed','fieldset','figure','footer','form','frame','frameset','h1','h2','h3','h4','h5',
71 'h6','head','header','hgroup','hr','iframe','img','input','isindex','li','link',
72 'listing','menu','meta','nav','noembed','noframes','noscript','ol',
73 'p','param','plaintext','pre','script','select','spacer','style',
74 'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
76 private $pendingTableCharacters;
77 private $pendingTableCharactersDirty;
79 // Tree construction modes
80 const INITIAL = 0;
81 const BEFORE_HTML = 1;
82 const BEFORE_HEAD = 2;
83 const IN_HEAD = 3;
84 const IN_HEAD_NOSCRIPT = 4;
85 const AFTER_HEAD = 5;
86 const IN_BODY = 6;
87 const IN_CDATA_RCDATA = 7;
88 const IN_TABLE = 8;
89 const IN_TABLE_TEXT = 9;
90 const IN_CAPTION = 10;
91 const IN_COLUMN_GROUP = 11;
92 const IN_TABLE_BODY = 12;
93 const IN_ROW = 13;
94 const IN_CELL = 14;
95 const IN_SELECT = 15;
96 const IN_SELECT_IN_TABLE= 16;
97 const IN_FOREIGN_CONTENT= 17;
98 const AFTER_BODY = 18;
99 const IN_FRAMESET = 19;
100 const AFTER_FRAMESET = 20;
101 const AFTER_AFTER_BODY = 21;
102 const AFTER_AFTER_FRAMESET = 22;
/**
 * Converts a magic number to a readable constant name. Use for debugging.
 *
 * The lookup table is built once, by reflecting over the integer constants
 * of HTML5_TreeBuilder, and cached in a static variable.
 *
 * Several constant groups of this class reuse the same values (for example
 * INITIAL and SCOPE are both 0). The first constant declared for a value
 * wins, so the insertion-mode names are not shadowed by constants that are
 * declared further down the class.
 *
 * @param int $number Constant value to look up.
 * @return string|null Name of the constant, or null when no integer
 *                     constant of the class has that value.
 */
private function strConst($number) {
    static $lookup;
    if (!$lookup) {
        $lookup = array();
        $r = new ReflectionClass('HTML5_TreeBuilder');
        foreach ($r->getConstants() as $const => $num) {
            // Non-integer constants (namespace URIs, NS_HTML === null)
            // cannot be used as magic numbers.
            if (!is_int($num)) {
                continue;
            }
            // Keep the first declared name for each value instead of
            // letting later constants overwrite it.
            if (!isset($lookup[$num])) {
                $lookup[$num] = $const;
            }
        }
    }
    // Unknown values yield null rather than an undefined-index notice.
    return isset($lookup[$number]) ? $lookup[$number] : null;
}
123 // The different types of elements.
124 const SPECIAL = 100;
125 const SCOPING = 101;
126 const FORMATTING = 102;
127 const PHRASING = 103;
129 // Quirks modes in $quirks_mode
130 const NO_QUIRKS = 200;
131 const QUIRKS_MODE = 201;
132 const LIMITED_QUIRKS_MODE = 202;
134 // Marker to be placed in $a_formatting
135 const MARKER = 300;
137 // Namespaces for foreign content
138 const NS_HTML = null; // to prevent DOM from requiring NS on everything
139 const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
140 const NS_SVG = 'http://www.w3.org/2000/svg';
141 const NS_XLINK = 'http://www.w3.org/1999/xlink';
142 const NS_XML = 'http://www.w3.org/XML/1998/namespace';
143 const NS_XMLNS = 'http://www.w3.org/2000/xmlns/';
145 // Different types of scopes to test for elements
146 const SCOPE = 0;
147 const SCOPE_LISTITEM = 1;
148 const SCOPE_TABLE = 2;
/**
 * HTML5_TreeBuilder constructor.
 *
 * Creates and configures the DOMDocument that the builder populates, and
 * puts the builder into the INITIAL insertion mode.
 */
public function __construct() {
    $document = new DOMDocument();
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->mode = self::INITIAL;
}
/**
 * Returns the quirks mode recorded by the builder.
 *
 * @return int|null One of NO_QUIRKS, QUIRKS_MODE or LIMITED_QUIRKS_MODE,
 *                  or null while the INITIAL insertion mode has not yet
 *                  assigned a value.
 */
public function getQuirksMode() {
    return $this->quirks_mode;
}
168 * Process tag tokens
170 * @param $token
171 * @param null $mode
173 public function emitToken($token, $mode = null) {
174 // XXX: ignore parse errors... why are we emitting them, again?
175 if ($token['type'] === HTML5_Tokenizer::PARSEERROR) {
176 return;
178 if ($mode === null) {
179 $mode = $this->mode;
183 $backtrace = debug_backtrace();
184 if ($backtrace[1]['class'] !== 'HTML5_TreeBuilder') echo "--\n";
185 echo $this->strConst($mode);
186 if ($this->original_mode) echo " (originally ".$this->strConst($this->original_mode).")";
187 echo "\n ";
188 token_dump($token);
189 $this->printStack();
190 $this->printActiveFormattingElements();
191 if ($this->foster_parent) echo " -> this is a foster parent mode\n";
192 if ($this->flag_frameset_ok) echo " -> frameset ok\n";
195 if ($this->ignore_lf_token) {
196 $this->ignore_lf_token--;
198 $this->ignored = false;
200 switch ($mode) {
201 case self::INITIAL:
203 /* A character token that is one of U+0009 CHARACTER TABULATION,
204 * U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020 SPACE */
205 if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
206 /* Ignore the token. */
207 $this->ignored = true;
208 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
209 if (
210 $token['name'] !== 'html' || !empty($token['public']) ||
211 !empty($token['system']) || $token !== 'about:legacy-compat'
213 /* If the DOCTYPE token's name is not a case-sensitive match
214 * for the string "html", or if the token's public identifier
215 * is not missing, or if the token's system identifier is
216 * neither missing nor a case-sensitive match for the string
217 * "about:legacy-compat", then there is a parse error (this
218 * is the DOCTYPE parse error). */
219 // DOCTYPE parse error
221 /* Append a DocumentType node to the Document node, with the name
222 * attribute set to the name given in the DOCTYPE token, or the
223 * empty string if the name was missing; the publicId attribute
224 * set to the public identifier given in the DOCTYPE token, or
225 * the empty string if the public identifier was missing; the
226 * systemId attribute set to the system identifier given in the
227 * DOCTYPE token, or the empty string if the system identifier
228 * was missing; and the other attributes specific to
229 * DocumentType objects set to null and empty lists as
230 * appropriate. Associate the DocumentType node with the
231 * Document object so that it is returned as the value of the
232 * doctype attribute of the Document object. */
233 if (!isset($token['public'])) {
234 $token['public'] = null;
236 if (!isset($token['system'])) {
237 $token['system'] = null;
239 // XDOM
240 // Yes this is hacky. I'm kind of annoyed that I can't appendChild
241 // a doctype to DOMDocument. Maybe I haven't chanted the right
242 // syllables.
243 $impl = new DOMImplementation();
244 // This call can fail for particularly pathological cases (namely,
245 // the qualifiedName parameter ($token['name']) could be missing.
246 if ($token['name']) {
247 $doctype = $impl->createDocumentType($token['name'], $token['public'], $token['system']);
248 $this->dom->appendChild($doctype);
249 } else {
250 // It looks like libxml's not actually *able* to express this case.
251 // So... don't.
252 $this->dom->emptyDoctype = true;
254 $public = is_null($token['public']) ? false : strtolower($token['public']);
255 $system = is_null($token['system']) ? false : strtolower($token['system']);
256 $publicStartsWithForQuirks = array(
257 "+//silmaril//dtd html pro v0r11 19970101//",
258 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
259 "-//as//dtd html 3.0 aswedit + extensions//",
260 "-//ietf//dtd html 2.0 level 1//",
261 "-//ietf//dtd html 2.0 level 2//",
262 "-//ietf//dtd html 2.0 strict level 1//",
263 "-//ietf//dtd html 2.0 strict level 2//",
264 "-//ietf//dtd html 2.0 strict//",
265 "-//ietf//dtd html 2.0//",
266 "-//ietf//dtd html 2.1e//",
267 "-//ietf//dtd html 3.0//",
268 "-//ietf//dtd html 3.2 final//",
269 "-//ietf//dtd html 3.2//",
270 "-//ietf//dtd html 3//",
271 "-//ietf//dtd html level 0//",
272 "-//ietf//dtd html level 1//",
273 "-//ietf//dtd html level 2//",
274 "-//ietf//dtd html level 3//",
275 "-//ietf//dtd html strict level 0//",
276 "-//ietf//dtd html strict level 1//",
277 "-//ietf//dtd html strict level 2//",
278 "-//ietf//dtd html strict level 3//",
279 "-//ietf//dtd html strict//",
280 "-//ietf//dtd html//",
281 "-//metrius//dtd metrius presentational//",
282 "-//microsoft//dtd internet explorer 2.0 html strict//",
283 "-//microsoft//dtd internet explorer 2.0 html//",
284 "-//microsoft//dtd internet explorer 2.0 tables//",
285 "-//microsoft//dtd internet explorer 3.0 html strict//",
286 "-//microsoft//dtd internet explorer 3.0 html//",
287 "-//microsoft//dtd internet explorer 3.0 tables//",
288 "-//netscape comm. corp.//dtd html//",
289 "-//netscape comm. corp.//dtd strict html//",
290 "-//o'reilly and associates//dtd html 2.0//",
291 "-//o'reilly and associates//dtd html extended 1.0//",
292 "-//o'reilly and associates//dtd html extended relaxed 1.0//",
293 "-//spyglass//dtd html 2.0 extended//",
294 "-//sq//dtd html 2.0 hotmetal + extensions//",
295 "-//sun microsystems corp.//dtd hotjava html//",
296 "-//sun microsystems corp.//dtd hotjava strict html//",
297 "-//w3c//dtd html 3 1995-03-24//",
298 "-//w3c//dtd html 3.2 draft//",
299 "-//w3c//dtd html 3.2 final//",
300 "-//w3c//dtd html 3.2//",
301 "-//w3c//dtd html 3.2s draft//",
302 "-//w3c//dtd html 4.0 frameset//",
303 "-//w3c//dtd html 4.0 transitional//",
304 "-//w3c//dtd html experimental 19960712//",
305 "-//w3c//dtd html experimental 970421//",
306 "-//w3c//dtd w3 html//",
307 "-//w3o//dtd w3 html 3.0//",
308 "-//webtechs//dtd mozilla html 2.0//",
309 "-//webtechs//dtd mozilla html//",
311 $publicSetToForQuirks = array(
312 "-//w3o//dtd w3 html strict 3.0//",
313 "-/w3c/dtd html 4.0 transitional/en",
314 "html",
316 $publicStartsWithAndSystemForQuirks = array(
317 "-//w3c//dtd html 4.01 frameset//",
318 "-//w3c//dtd html 4.01 transitional//",
320 $publicStartsWithForLimitedQuirks = array(
321 "-//w3c//dtd xhtml 1.0 frameset//",
322 "-//w3c//dtd xhtml 1.0 transitional//",
324 $publicStartsWithAndSystemForLimitedQuirks = array(
325 "-//w3c//dtd html 4.01 frameset//",
326 "-//w3c//dtd html 4.01 transitional//",
328 // first, do easy checks
329 if (
330 !empty($token['force-quirks']) ||
331 strtolower($token['name']) !== 'html'
333 $this->quirks_mode = self::QUIRKS_MODE;
334 } else {
335 do {
336 if ($system) {
337 foreach ($publicStartsWithAndSystemForQuirks as $x) {
338 if (strncmp($public, $x, strlen($x)) === 0) {
339 $this->quirks_mode = self::QUIRKS_MODE;
340 break;
343 if (!is_null($this->quirks_mode)) {
344 break;
346 foreach ($publicStartsWithAndSystemForLimitedQuirks as $x) {
347 if (strncmp($public, $x, strlen($x)) === 0) {
348 $this->quirks_mode = self::LIMITED_QUIRKS_MODE;
349 break;
352 if (!is_null($this->quirks_mode)) {
353 break;
356 foreach ($publicSetToForQuirks as $x) {
357 if ($public === $x) {
358 $this->quirks_mode = self::QUIRKS_MODE;
359 break;
362 if (!is_null($this->quirks_mode)) {
363 break;
365 foreach ($publicStartsWithForLimitedQuirks as $x) {
366 if (strncmp($public, $x, strlen($x)) === 0) {
367 $this->quirks_mode = self::LIMITED_QUIRKS_MODE;
370 if (!is_null($this->quirks_mode)) {
371 break;
373 if ($system === "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
374 $this->quirks_mode = self::QUIRKS_MODE;
375 break;
377 foreach ($publicStartsWithForQuirks as $x) {
378 if (strncmp($public, $x, strlen($x)) === 0) {
379 $this->quirks_mode = self::QUIRKS_MODE;
380 break;
383 if (is_null($this->quirks_mode)) {
384 $this->quirks_mode = self::NO_QUIRKS;
386 } while (false);
388 $this->mode = self::BEFORE_HTML;
389 } else {
390 // parse error
391 /* Switch the insertion mode to "before html", then reprocess the
392 * current token. */
393 $this->mode = self::BEFORE_HTML;
394 $this->quirks_mode = self::QUIRKS_MODE;
395 $this->emitToken($token);
397 break;
399 case self::BEFORE_HTML:
400 /* A DOCTYPE token */
401 if ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
402 // Parse error. Ignore the token.
403 $this->ignored = true;
405 /* A comment token */
406 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
407 /* Append a Comment node to the Document object with the data
408 attribute set to the data given in the comment token. */
409 // XDOM
410 $comment = $this->dom->createComment($token['data']);
411 $this->dom->appendChild($comment);
413 /* A character token that is one of U+0009 CHARACTER TABULATION,
414 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
415 or U+0020 SPACE */
416 } elseif ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
417 /* Ignore the token. */
418 $this->ignored = true;
420 /* A start tag whose tag name is "html" */
421 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] == 'html') {
422 /* Create an element for the token in the HTML namespace. Append it
423 * to the Document object. Put this element in the stack of open
424 * elements. */
425 // XDOM
426 $html = $this->insertElement($token, false);
427 $this->dom->appendChild($html);
428 $this->stack[] = $html;
430 $this->mode = self::BEFORE_HEAD;
432 } else {
433 /* Create an html element. Append it to the Document object. Put
434 * this element in the stack of open elements. */
435 // XDOM
436 $html = $this->dom->createElementNS(self::NS_HTML, 'html');
437 $this->dom->appendChild($html);
438 $this->stack[] = $html;
440 /* Switch the insertion mode to "before head", then reprocess the
441 * current token. */
442 $this->mode = self::BEFORE_HEAD;
443 $this->emitToken($token);
445 break;
447 case self::BEFORE_HEAD:
448 /* A character token that is one of U+0009 CHARACTER TABULATION,
449 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
450 or U+0020 SPACE */
451 if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
452 /* Ignore the token. */
453 $this->ignored = true;
455 /* A comment token */
456 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
457 /* Append a Comment node to the current node with the data attribute
458 set to the data given in the comment token. */
459 $this->insertComment($token['data']);
461 /* A DOCTYPE token */
462 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
463 /* Parse error. Ignore the token */
464 $this->ignored = true;
465 // parse error
467 /* A start tag token with the tag name "html" */
468 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
469 /* Process the token using the rules for the "in body"
470 * insertion mode. */
471 $this->processWithRulesFor($token, self::IN_BODY);
473 /* A start tag token with the tag name "head" */
474 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') {
475 /* Insert an HTML element for the token. */
476 $element = $this->insertElement($token);
478 /* Set the head element pointer to this new element node. */
479 $this->head_pointer = $element;
481 /* Change the insertion mode to "in head". */
482 $this->mode = self::IN_HEAD;
484 /* An end tag whose tag name is one of: "head", "body", "html", "br" */
485 } elseif (
486 $token['type'] === HTML5_Tokenizer::ENDTAG && (
487 $token['name'] === 'head' || $token['name'] === 'body' ||
488 $token['name'] === 'html' || $token['name'] === 'br'
489 )) {
490 /* Act as if a start tag token with the tag name "head" and no
491 * attributes had been seen, then reprocess the current token. */
492 $this->emitToken(array(
493 'name' => 'head',
494 'type' => HTML5_Tokenizer::STARTTAG,
495 'attr' => array()
497 $this->emitToken($token);
499 /* Any other end tag */
500 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG) {
501 /* Parse error. Ignore the token. */
502 $this->ignored = true;
504 } else {
505 /* Act as if a start tag token with the tag name "head" and no
506 * attributes had been seen, then reprocess the current token.
507 * Note: This will result in an empty head element being
508 * generated, with the current token being reprocessed in the
509 * "after head" insertion mode. */
510 $this->emitToken(array(
511 'name' => 'head',
512 'type' => HTML5_Tokenizer::STARTTAG,
513 'attr' => array()
515 $this->emitToken($token);
517 break;
519 case self::IN_HEAD:
520 /* A character token that is one of U+0009 CHARACTER TABULATION,
521 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
522 or U+0020 SPACE. */
523 if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
524 /* Insert the character into the current node. */
525 $this->insertText($token['data']);
527 /* A comment token */
528 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
529 /* Append a Comment node to the current node with the data attribute
530 set to the data given in the comment token. */
531 $this->insertComment($token['data']);
533 /* A DOCTYPE token */
534 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
535 /* Parse error. Ignore the token. */
536 $this->ignored = true;
537 // parse error
539 /* A start tag whose tag name is "html" */
540 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
541 $token['name'] === 'html') {
542 $this->processWithRulesFor($token, self::IN_BODY);
544 /* A start tag whose tag name is one of: "base", "command", "link" */
545 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
546 ($token['name'] === 'base' || $token['name'] === 'command' ||
547 $token['name'] === 'link')) {
548 /* Insert an HTML element for the token. Immediately pop the
549 * current node off the stack of open elements. */
550 $this->insertElement($token);
551 array_pop($this->stack);
553 // YYY: Acknowledge the token's self-closing flag, if it is set.
555 /* A start tag whose tag name is "meta" */
556 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'meta') {
557 /* Insert an HTML element for the token. Immediately pop the
558 * current node off the stack of open elements. */
559 $this->insertElement($token);
560 array_pop($this->stack);
562 // XERROR: Acknowledge the token's self-closing flag, if it is set.
564 // XENCODING: If the element has a charset attribute, and its value is a
565 // supported encoding, and the confidence is currently tentative,
566 // then change the encoding to the encoding given by the value of
567 // the charset attribute.
569 // Otherwise, if the element has a content attribute, and applying
570 // the algorithm for extracting an encoding from a Content-Type to
571 // its value returns a supported encoding encoding, and the
572 // confidence is currently tentative, then change the encoding to
573 // the encoding encoding.
575 /* A start tag with the tag name "title" */
576 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'title') {
577 $this->insertRCDATAElement($token);
579 /* A start tag whose tag name is "noscript", if the scripting flag is enabled, or
580 * A start tag whose tag name is one of: "noframes", "style" */
581 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
582 ($token['name'] === 'noscript' || $token['name'] === 'noframes' || $token['name'] === 'style')) {
583 // XSCRIPT: Scripting flag not respected
584 $this->insertCDATAElement($token);
586 // XSCRIPT: Scripting flag disable not implemented
588 /* A start tag with the tag name "script" */
589 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'script') {
590 /* 1. Create an element for the token in the HTML namespace. */
591 $node = $this->insertElement($token, false);
593 /* 2. Mark the element as being "parser-inserted" */
594 // Uhhh... XSCRIPT
596 /* 3. If the parser was originally created for the HTML
597 * fragment parsing algorithm, then mark the script element as
598 * "already executed". (fragment case) */
599 // ditto... XSCRIPT
601 /* 4. Append the new element to the current node and push it onto
602 * the stack of open elements. */
603 end($this->stack)->appendChild($node);
604 $this->stack[] = $node;
605 // I guess we could squash these together
607 /* 6. Let the original insertion mode be the current insertion mode. */
608 $this->original_mode = $this->mode;
609 /* 7. Switch the insertion mode to "in CDATA/RCDATA" */
610 $this->mode = self::IN_CDATA_RCDATA;
611 /* 5. Switch the tokeniser's content model flag to the CDATA state. */
612 $this->content_model = HTML5_Tokenizer::CDATA;
614 /* An end tag with the tag name "head" */
615 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'head') {
616 /* Pop the current node (which will be the head element) off the stack of open elements. */
617 array_pop($this->stack);
619 /* Change the insertion mode to "after head". */
620 $this->mode = self::AFTER_HEAD;
622 // Slight logic inversion here to minimize duplication
623 /* A start tag with the tag name "head". */
624 /* An end tag whose tag name is not one of: "body", "html", "br" */
625 } elseif (($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') ||
626 ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] !== 'html' &&
627 $token['name'] !== 'body' && $token['name'] !== 'br')) {
628 // Parse error. Ignore the token.
629 $this->ignored = true;
631 /* Anything else */
632 } else {
633 /* Act as if an end tag token with the tag name "head" had been
634 * seen, and reprocess the current token. */
635 $this->emitToken(array(
636 'name' => 'head',
637 'type' => HTML5_Tokenizer::ENDTAG
640 /* Then, reprocess the current token. */
641 $this->emitToken($token);
643 break;
645 case self::IN_HEAD_NOSCRIPT:
646 if ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
647 // parse error
648 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
649 $this->processWithRulesFor($token, self::IN_BODY);
650 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'noscript') {
651 /* Pop the current node (which will be a noscript element) from the
652 * stack of open elements; the new current node will be a head
653 * element. */
654 array_pop($this->stack);
655 $this->mode = self::IN_HEAD;
656 } elseif (
657 ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) ||
658 ($token['type'] === HTML5_Tokenizer::COMMENT) ||
659 ($token['type'] === HTML5_Tokenizer::STARTTAG && (
660 $token['name'] === 'link' || $token['name'] === 'meta' ||
661 $token['name'] === 'noframes' || $token['name'] === 'style'))) {
662 $this->processWithRulesFor($token, self::IN_HEAD);
663 // inverted logic
664 } elseif (
665 ($token['type'] === HTML5_Tokenizer::STARTTAG && (
666 $token['name'] === 'head' || $token['name'] === 'noscript')) ||
667 ($token['type'] === HTML5_Tokenizer::ENDTAG &&
668 $token['name'] !== 'br')) {
669 // parse error
670 } else {
671 // parse error
672 $this->emitToken(array(
673 'type' => HTML5_Tokenizer::ENDTAG,
674 'name' => 'noscript',
676 $this->emitToken($token);
678 break;
680 case self::AFTER_HEAD:
681 /* Handle the token as follows: */
683 /* A character token that is one of U+0009 CHARACTER TABULATION,
684 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
685 or U+0020 SPACE */
686 if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
687 /* Append the character to the current node. */
688 $this->insertText($token['data']);
690 /* A comment token */
691 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
692 /* Append a Comment node to the current node with the data attribute
693 set to the data given in the comment token. */
694 $this->insertComment($token['data']);
696 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
697 // parse error
699 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
700 $this->processWithRulesFor($token, self::IN_BODY);
702 /* A start tag token with the tag name "body" */
703 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'body') {
704 $this->insertElement($token);
706 /* Set the frameset-ok flag to "not ok". */
707 $this->flag_frameset_ok = false;
709 /* Change the insertion mode to "in body". */
710 $this->mode = self::IN_BODY;
712 /* A start tag token with the tag name "frameset" */
713 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'frameset') {
714 /* Insert a frameset element for the token. */
715 $this->insertElement($token);
717 /* Change the insertion mode to "in frameset". */
718 $this->mode = self::IN_FRAMESET;
720 /* A start tag token whose tag name is one of: "base", "link", "meta",
721 "script", "style", "title" */
722 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
723 array('base', 'link', 'meta', 'noframes', 'script', 'style', 'title'))) {
724 // parse error
725 /* Push the node pointed to by the head element pointer onto the
726 * stack of open elements. */
727 $this->stack[] = $this->head_pointer;
728 $this->processWithRulesFor($token, self::IN_HEAD);
729 array_splice($this->stack, array_search($this->head_pointer, $this->stack, true), 1);
731 // inversion of specification
732 } elseif (
733 ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'head') ||
734 ($token['type'] === HTML5_Tokenizer::ENDTAG &&
735 $token['name'] !== 'body' && $token['name'] !== 'html' &&
736 $token['name'] !== 'br')) {
737 // parse error
739 /* Anything else */
740 } else {
741 $this->emitToken(array(
742 'name' => 'body',
743 'type' => HTML5_Tokenizer::STARTTAG,
744 'attr' => array()
746 $this->flag_frameset_ok = true;
747 $this->emitToken($token);
749 break;
751 case self::IN_BODY:
752 /* Handle the token as follows: */
754 switch($token['type']) {
755 /* A character token */
756 case HTML5_Tokenizer::CHARACTER:
757 case HTML5_Tokenizer::SPACECHARACTER:
758 /* Reconstruct the active formatting elements, if any. */
759 $this->reconstructActiveFormattingElements();
761 /* Append the token's character to the current node. */
762 $this->insertText($token['data']);
764 /* If the token is not one of U+0009 CHARACTER TABULATION,
765 * U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020
766 * SPACE, then set the frameset-ok flag to "not ok". */
767 // i.e., if any of the characters is not whitespace
768 if (strlen($token['data']) !== strspn($token['data'], HTML5_Tokenizer::WHITESPACE)) {
769 $this->flag_frameset_ok = false;
771 break;
773 /* A comment token */
774 case HTML5_Tokenizer::COMMENT:
775 /* Append a Comment node to the current node with the data
776 attribute set to the data given in the comment token. */
777 $this->insertComment($token['data']);
778 break;
780 case HTML5_Tokenizer::DOCTYPE:
781 // parse error
782 break;
784 case HTML5_Tokenizer::EOF:
785 // parse error
786 break;
788 case HTML5_Tokenizer::STARTTAG:
789 switch($token['name']) {
790 case 'html':
791 // parse error
792 /* For each attribute on the token, check to see if the
793 * attribute is already present on the top element of the
794 * stack of open elements. If it is not, add the attribute
795 * and its corresponding value to that element. */
796 foreach($token['attr'] as $attr) {
797 if (!$this->stack[0]->hasAttribute($attr['name'])) {
798 $this->stack[0]->setAttribute($attr['name'], $attr['value']);
801 break;
803 case 'base': case 'command': case 'link': case 'meta': case 'noframes':
804 case 'script': case 'style': case 'title':
805 /* Process the token as if the insertion mode had been "in
806 head". */
807 $this->processWithRulesFor($token, self::IN_HEAD);
808 break;
810 /* A start tag token with the tag name "body" */
811 case 'body':
812 /* Parse error. If the second element on the stack of open
813 elements is not a body element, or, if the stack of open
814 elements has only one node on it, then ignore the token.
815 (fragment case) */
816 if (count($this->stack) === 1 || $this->stack[1]->tagName !== 'body') {
817 $this->ignored = true;
818 // Ignore
820 /* Otherwise, for each attribute on the token, check to see
821 if the attribute is already present on the body element (the
822 second element) on the stack of open elements. If it is not,
823 add the attribute and its corresponding value to that
824 element. */
825 } else {
826 foreach($token['attr'] as $attr) {
827 if (!$this->stack[1]->hasAttribute($attr['name'])) {
828 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
832 break;
834 case 'frameset':
835 // parse error
836 /* If the second element on the stack of open elements is
837 * not a body element, or, if the stack of open elements
838 * has only one node on it, then ignore the token.
839 * (fragment case) */
840 if (count($this->stack) === 1 || $this->stack[1]->tagName !== 'body') {
841 $this->ignored = true;
842 // Ignore
843 } elseif (!$this->flag_frameset_ok) {
844 $this->ignored = true;
845 // Ignore
846 } else {
847 /* 1. Remove the second element on the stack of open
848 * elements from its parent node, if it has one. */
849 if ($this->stack[1]->parentNode) {
850 $this->stack[1]->parentNode->removeChild($this->stack[1]);
853 /* 2. Pop all the nodes from the bottom of the stack of
854 * open elements, from the current node up to the root
855 * html element. */
856 array_splice($this->stack, 1);
858 $this->insertElement($token);
859 $this->mode = self::IN_FRAMESET;
861 break;
863 // in spec, there is a diversion here
865 case 'address': case 'article': case 'aside': case 'blockquote':
866 case 'center': case 'datagrid': case 'details': case 'dir':
867 case 'div': case 'dl': case 'fieldset': case 'figure': case 'footer':
868 case 'header': case 'hgroup': case 'menu': case 'nav':
869 case 'ol': case 'p': case 'section': case 'ul':
870 /* If the stack of open elements has a p element in scope,
871 then act as if an end tag with the tag name p had been
872 seen. */
873 if ($this->elementInScope('p')) {
874 $this->emitToken(array(
875 'name' => 'p',
876 'type' => HTML5_Tokenizer::ENDTAG
880 /* Insert an HTML element for the token. */
881 $this->insertElement($token);
882 break;
884 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
885 "h5", "h6" */
886 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
887 /* If the stack of open elements has a p element in scope,
888 then act as if an end tag with the tag name p had been seen. */
889 if ($this->elementInScope('p')) {
890 $this->emitToken(array(
891 'name' => 'p',
892 'type' => HTML5_Tokenizer::ENDTAG
896 /* If the current node is an element whose tag name is one
897 * of "h1", "h2", "h3", "h4", "h5", or "h6", then this is a
898 * parse error; pop the current node off the stack of open
899 * elements. */
900 $peek = array_pop($this->stack);
901 if (in_array($peek->tagName, array("h1", "h2", "h3", "h4", "h5", "h6"))) {
902 // parse error
903 } else {
904 $this->stack[] = $peek;
907 /* Insert an HTML element for the token. */
908 $this->insertElement($token);
909 break;
911 case 'pre': case 'listing':
912 /* If the stack of open elements has a p element in scope,
913 then act as if an end tag with the tag name p had been seen. */
914 if ($this->elementInScope('p')) {
915 $this->emitToken(array(
916 'name' => 'p',
917 'type' => HTML5_Tokenizer::ENDTAG
920 $this->insertElement($token);
921 /* If the next token is a U+000A LINE FEED (LF) character
922 * token, then ignore that token and move on to the next
923 * one. (Newlines at the start of pre blocks are ignored as
924 * an authoring convenience.) */
925 $this->ignore_lf_token = 2;
926 $this->flag_frameset_ok = false;
927 break;
929 /* A start tag whose tag name is "form" */
930 case 'form':
931 /* If the form element pointer is not null, ignore the
932 token with a parse error. */
933 if ($this->form_pointer !== null) {
934 $this->ignored = true;
935 // Ignore.
937 /* Otherwise: */
938 } else {
939 /* If the stack of open elements has a p element in
940 scope, then act as if an end tag with the tag name p
941 had been seen. */
942 if ($this->elementInScope('p')) {
943 $this->emitToken(array(
944 'name' => 'p',
945 'type' => HTML5_Tokenizer::ENDTAG
949 /* Insert an HTML element for the token, and set the
950 form element pointer to point to the element created. */
951 $element = $this->insertElement($token);
952 $this->form_pointer = $element;
954 break;
956 // condensed specification
957 case 'li': case 'dc': case 'dd': case 'ds': case 'dt':
958 /* 1. Set the frameset-ok flag to "not ok". */
959 $this->flag_frameset_ok = false;
961 $stack_length = count($this->stack) - 1;
962 for($n = $stack_length; 0 <= $n; $n--) {
963 /* 2. Initialise node to be the current node (the
964 bottommost node of the stack). */
965 $stop = false;
966 $node = $this->stack[$n];
967 $cat = $this->getElementCategory($node);
969 // for case 'li':
970 /* 3. If node is an li element, then act as if an end
971 * tag with the tag name "li" had been seen, then jump
972 * to the last step. */
973 // for case 'dc': case 'dd': case 'ds': case 'dt':
974 /* If node is a dc, dd, ds or dt element, then act as if an end
975 * tag with the same tag name as node had been seen, then
976 * jump to the last step. */
977 if (($token['name'] === 'li' && $node->tagName === 'li') ||
978 ($token['name'] !== 'li' && ($node->tagName == 'dc' || $node->tagName === 'dd' || $node->tagName == 'ds' || $node->tagName === 'dt'))) { // limited conditional
979 $this->emitToken(array(
980 'type' => HTML5_Tokenizer::ENDTAG,
981 'name' => $node->tagName,
983 break;
986 /* 4. If node is not in the formatting category, and is
987 not in the phrasing category, and is not an address,
988 div or p element, then stop this algorithm. */
989 if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
990 $node->tagName !== 'address' && $node->tagName !== 'div' &&
991 $node->tagName !== 'p') {
992 break;
995 /* 5. Otherwise, set node to the previous entry in the
996 * stack of open elements and return to step 2. */
999 /* 6. This is the last step. */
1001 /* If the stack of open elements has a p element in scope,
1002 then act as if an end tag with the tag name p had been
1003 seen. */
1004 if ($this->elementInScope('p')) {
1005 $this->emitToken(array(
1006 'name' => 'p',
1007 'type' => HTML5_Tokenizer::ENDTAG
1011 /* Finally, insert an HTML element with the same tag
1012 name as the token's. */
1013 $this->insertElement($token);
1014 break;
1016 /* A start tag token whose tag name is "plaintext" */
1017 case 'plaintext':
1018 /* If the stack of open elements has a p element in scope,
1019 then act as if an end tag with the tag name p had been
1020 seen. */
1021 if ($this->elementInScope('p')) {
1022 $this->emitToken(array(
1023 'name' => 'p',
1024 'type' => HTML5_Tokenizer::ENDTAG
1028 /* Insert an HTML element for the token. */
1029 $this->insertElement($token);
1031 $this->content_model = HTML5_Tokenizer::PLAINTEXT;
1032 break;
1034 // more diversions
1036 /* A start tag whose tag name is "a" */
1037 case 'a':
1038 /* If the list of active formatting elements contains
1039 an element whose tag name is "a" between the end of the
1040 list and the last marker on the list (or the start of
1041 the list if there is no marker on the list), then this
1042 is a parse error; act as if an end tag with the tag name
1043 "a" had been seen, then remove that element from the list
1044 of active formatting elements and the stack of open
1045 elements if the end tag didn't already remove it (it
1046 might not have if the element is not in table scope). */
1047 $leng = count($this->a_formatting);
1049 for ($n = $leng - 1; $n >= 0; $n--) {
1050 if ($this->a_formatting[$n] === self::MARKER) {
1051 break;
1053 } elseif ($this->a_formatting[$n]->tagName === 'a') {
1054 $a = $this->a_formatting[$n];
1055 $this->emitToken(array(
1056 'name' => 'a',
1057 'type' => HTML5_Tokenizer::ENDTAG
1059 if (in_array($a, $this->a_formatting)) {
1060 $a_i = array_search($a, $this->a_formatting, true);
1061 if ($a_i !== false) {
1062 array_splice($this->a_formatting, $a_i, 1);
1065 if (in_array($a, $this->stack)) {
1066 $a_i = array_search($a, $this->stack, true);
1067 if ($a_i !== false) {
1068 array_splice($this->stack, $a_i, 1);
1071 break;
1075 /* Reconstruct the active formatting elements, if any. */
1076 $this->reconstructActiveFormattingElements();
1078 /* Insert an HTML element for the token. */
1079 $el = $this->insertElement($token);
1081 /* Add that element to the list of active formatting
1082 elements. */
1083 $this->a_formatting[] = $el;
1084 break;
1086 case 'b': case 'big': case 'code': case 'em': case 'font': case 'i':
1087 case 's': case 'small': case 'strike':
1088 case 'strong': case 'tt': case 'u':
1089 /* Reconstruct the active formatting elements, if any. */
1090 $this->reconstructActiveFormattingElements();
1092 /* Insert an HTML element for the token. */
1093 $el = $this->insertElement($token);
1095 /* Add that element to the list of active formatting
1096 elements. */
1097 $this->a_formatting[] = $el;
1098 break;
1100 case 'nobr':
1101 /* Reconstruct the active formatting elements, if any. */
1102 $this->reconstructActiveFormattingElements();
1104 /* If the stack of open elements has a nobr element in
1105 * scope, then this is a parse error; act as if an end tag
1106 * with the tag name "nobr" had been seen, then once again
1107 * reconstruct the active formatting elements, if any. */
1108 if ($this->elementInScope('nobr')) {
1109 $this->emitToken(array(
1110 'name' => 'nobr',
1111 'type' => HTML5_Tokenizer::ENDTAG,
1113 $this->reconstructActiveFormattingElements();
1116 /* Insert an HTML element for the token. */
1117 $el = $this->insertElement($token);
1119 /* Add that element to the list of active formatting
1120 elements. */
1121 $this->a_formatting[] = $el;
1122 break;
1124 // another diversion
1126 /* A start tag token whose tag name is "button" */
1127 case 'button':
1128 /* If the stack of open elements has a button element in scope,
1129 then this is a parse error; act as if an end tag with the tag
1130 name "button" had been seen, then reprocess the token. (We don't
1131 do that. Unnecessary.) (I hope you're right! -- ezyang) */
1132 if ($this->elementInScope('button')) {
1133 $this->emitToken(array(
1134 'name' => 'button',
1135 'type' => HTML5_Tokenizer::ENDTAG
1139 /* Reconstruct the active formatting elements, if any. */
1140 $this->reconstructActiveFormattingElements();
1142 /* Insert an HTML element for the token. */
1143 $this->insertElement($token);
1145 /* Insert a marker at the end of the list of active
1146 formatting elements. */
1147 $this->a_formatting[] = self::MARKER;
1149 $this->flag_frameset_ok = false;
1150 break;
1152 case 'applet': case 'marquee': case 'object':
1153 /* Reconstruct the active formatting elements, if any. */
1154 $this->reconstructActiveFormattingElements();
1156 /* Insert an HTML element for the token. */
1157 $this->insertElement($token);
1159 /* Insert a marker at the end of the list of active
1160 formatting elements. */
1161 $this->a_formatting[] = self::MARKER;
1163 $this->flag_frameset_ok = false;
1164 break;
1166 // spec diversion
1168 /* A start tag whose tag name is "table" */
1169 case 'table':
1170 /* If the Document is not set to quirks mode, and the
1171 * stack of open elements has a p element in scope, then
1172 * act as if an end tag with the tag name "p" had been
1173 * seen. */
1174 if ($this->quirks_mode !== self::QUIRKS_MODE &&
1175 $this->elementInScope('p')) {
1176 $this->emitToken(array(
1177 'name' => 'p',
1178 'type' => HTML5_Tokenizer::ENDTAG
1182 /* Insert an HTML element for the token. */
1183 $this->insertElement($token);
1185 $this->flag_frameset_ok = false;
1187 /* Change the insertion mode to "in table". */
1188 $this->mode = self::IN_TABLE;
1189 break;
1191 /* A start tag whose tag name is one of: "area", "basefont",
1192 "bgsound", "br", "embed", "img", "input", "keygen", "spacer", "wbr" */
1193 case 'area': case 'basefont': case 'bgsound': case 'br':
1194 case 'embed': case 'img': case 'input': case 'keygen': case 'spacer':
1195 case 'wbr':
1196 /* Reconstruct the active formatting elements, if any. */
1197 $this->reconstructActiveFormattingElements();
1199 /* Insert an HTML element for the token. */
1200 $this->insertElement($token);
1202 /* Immediately pop the current node off the stack of open elements. */
1203 array_pop($this->stack);
1205 // YYY: Acknowledge the token's self-closing flag, if it is set.
1207 $this->flag_frameset_ok = false;
1208 break;
1210 case 'param': case 'source':
1211 /* Insert an HTML element for the token. */
1212 $this->insertElement($token);
1214 /* Immediately pop the current node off the stack of open elements. */
1215 array_pop($this->stack);
1217 // YYY: Acknowledge the token's self-closing flag, if it is set.
1218 break;
1220 /* A start tag whose tag name is "hr" */
1221 case 'hr':
1222 /* If the stack of open elements has a p element in scope,
1223 then act as if an end tag with the tag name p had been seen. */
1224 if ($this->elementInScope('p')) {
1225 $this->emitToken(array(
1226 'name' => 'p',
1227 'type' => HTML5_Tokenizer::ENDTAG
1231 /* Insert an HTML element for the token. */
1232 $this->insertElement($token);
1234 /* Immediately pop the current node off the stack of open elements. */
1235 array_pop($this->stack);
1237 // YYY: Acknowledge the token's self-closing flag, if it is set.
1239 $this->flag_frameset_ok = false;
1240 break;
1242 /* A start tag whose tag name is "image" */
1243 case 'image':
1244 /* Parse error. Change the token's tag name to "img" and
1245 reprocess it. (Don't ask.) */
1246 $token['name'] = 'img';
1247 $this->emitToken($token);
1248 break;
1250 /* A start tag whose tag name is "isindex" */
1251 case 'isindex':
1252 /* Parse error. */
1254 /* If the form element pointer is not null,
1255 then ignore the token. */
1256 if ($this->form_pointer === null) {
1257 /* Act as if a start tag token with the tag name "form" had
1258 been seen. */
1259 /* If the token has an attribute called "action", set
1260 * the action attribute on the resulting form
1261 * element to the value of the "action" attribute of
1262 * the token. */
1263 $attr = array();
1264 $action = $this->getAttr($token, 'action');
1265 if ($action !== false) {
1266 $attr[] = array('name' => 'action', 'value' => $action);
1268 $this->emitToken(array(
1269 'name' => 'form',
1270 'type' => HTML5_Tokenizer::STARTTAG,
1271 'attr' => $attr
1274 /* Act as if a start tag token with the tag name "hr" had
1275 been seen. */
1276 $this->emitToken(array(
1277 'name' => 'hr',
1278 'type' => HTML5_Tokenizer::STARTTAG,
1279 'attr' => array()
1282 /* Act as if a start tag token with the tag name "label"
1283 had been seen. */
1284 $this->emitToken(array(
1285 'name' => 'label',
1286 'type' => HTML5_Tokenizer::STARTTAG,
1287 'attr' => array()
1290 /* Act as if a stream of character tokens had been seen. */
1291 $prompt = $this->getAttr($token, 'prompt');
1292 if ($prompt === false) {
1293 $prompt = 'This is a searchable index. '.
1294 'Insert your search keywords here: ';
1296 $this->emitToken(array(
1297 'data' => $prompt,
1298 'type' => HTML5_Tokenizer::CHARACTER,
1301 /* Act as if a start tag token with the tag name "input"
1302 had been seen, with all the attributes from the "isindex"
1303 token, except with the "name" attribute set to the value
1304 "isindex" (ignoring any explicit "name" attribute). */
1305 $attr = array();
1306 foreach ($token['attr'] as $keypair) {
1307 if ($keypair['name'] === 'name' || $keypair['name'] === 'action' ||
1308 $keypair['name'] === 'prompt') {
1309 continue;
1311 $attr[] = $keypair;
1313 $attr[] = array('name' => 'name', 'value' => 'isindex');
1315 $this->emitToken(array(
1316 'name' => 'input',
1317 'type' => HTML5_Tokenizer::STARTTAG,
1318 'attr' => $attr
1321 /* Act as if an end tag token with the tag name "label"
1322 had been seen. */
1323 $this->emitToken(array(
1324 'name' => 'label',
1325 'type' => HTML5_Tokenizer::ENDTAG
1328 /* Act as if a start tag token with the tag name "hr" had
1329 been seen. */
1330 $this->emitToken(array(
1331 'name' => 'hr',
1332 'type' => HTML5_Tokenizer::STARTTAG
1335 /* Act as if an end tag token with the tag name "form" had
1336 been seen. */
1337 $this->emitToken(array(
1338 'name' => 'form',
1339 'type' => HTML5_Tokenizer::ENDTAG
1341 } else {
1342 $this->ignored = true;
1344 break;
1346 /* A start tag whose tag name is "textarea" */
1347 case 'textarea':
1348 $this->insertElement($token);
1350 /* If the next token is a U+000A LINE FEED (LF)
1351 * character token, then ignore that token and move on to
1352 * the next one. (Newlines at the start of textarea
1353 * elements are ignored as an authoring convenience.)
1354 * need flag, see also <pre> */
1355 $this->ignore_lf_token = 2;
1357 $this->original_mode = $this->mode;
1358 $this->flag_frameset_ok = false;
1359 $this->mode = self::IN_CDATA_RCDATA;
1361 /* Switch the tokeniser's content model flag to the
1362 RCDATA state. */
1363 $this->content_model = HTML5_Tokenizer::RCDATA;
1364 break;
1366 /* A start tag token whose tag name is "xmp" */
1367 case 'xmp':
1368 /* If the stack of open elements has a p element in
1369 scope, then act as if an end tag with the tag name
1370 "p" has been seen. */
1371 if ($this->elementInScope('p')) {
1372 $this->emitToken(array(
1373 'name' => 'p',
1374 'type' => HTML5_Tokenizer::ENDTAG
1378 /* Reconstruct the active formatting elements, if any. */
1379 $this->reconstructActiveFormattingElements();
1381 $this->flag_frameset_ok = false;
1383 $this->insertCDATAElement($token);
1384 break;
1386 case 'iframe':
1387 $this->flag_frameset_ok = false;
1388 $this->insertCDATAElement($token);
1389 break;
1391 case 'noembed': case 'noscript':
1392 // XSCRIPT: should check scripting flag
1393 $this->insertCDATAElement($token);
1394 break;
1396 /* A start tag whose tag name is "select" */
1397 case 'select':
1398 /* Reconstruct the active formatting elements, if any. */
1399 $this->reconstructActiveFormattingElements();
1401 /* Insert an HTML element for the token. */
1402 $this->insertElement($token);
1404 $this->flag_frameset_ok = false;
1406 /* If the insertion mode is one of "in table", "in caption",
1407 * "in column group", "in table body", "in row", or "in
1408 * cell", then switch the insertion mode to "in select in
1409 * table". Otherwise, switch the insertion mode to "in
1410 * select". */
1411 if (
1412 $this->mode === self::IN_TABLE || $this->mode === self::IN_CAPTION ||
1413 $this->mode === self::IN_COLUMN_GROUP || $this->mode ==+self::IN_TABLE_BODY ||
1414 $this->mode === self::IN_ROW || $this->mode === self::IN_CELL
1416 $this->mode = self::IN_SELECT_IN_TABLE;
1417 } else {
1418 $this->mode = self::IN_SELECT;
1420 break;
1422 case 'option': case 'optgroup':
1423 if ($this->elementInScope('option')) {
1424 $this->emitToken(array(
1425 'name' => 'option',
1426 'type' => HTML5_Tokenizer::ENDTAG,
1429 $this->reconstructActiveFormattingElements();
1430 $this->insertElement($token);
1431 break;
1433 case 'rp': case 'rt':
1434 /* If the stack of open elements has a ruby element in scope, then generate
1435 * implied end tags. If the current node is not then a ruby element, this is
1436 * a parse error; pop all the nodes from the current node up to the node
1437 * immediately before the bottommost ruby element on the stack of open elements.
1439 if ($this->elementInScope('ruby')) {
1440 $this->generateImpliedEndTags();
1442 $peek = false;
1443 do {
1444 /*if ($peek) {
1445 // parse error
1447 $peek = array_pop($this->stack);
1448 } while ($peek->tagName !== 'ruby');
1449 $this->stack[] = $peek; // we popped one too many
1450 $this->insertElement($token);
1451 break;
1453 // spec diversion
1455 case 'math':
1456 $this->reconstructActiveFormattingElements();
1457 $token = $this->adjustMathMLAttributes($token);
1458 $token = $this->adjustForeignAttributes($token);
1459 $this->insertForeignElement($token, self::NS_MATHML);
1460 if (isset($token['self-closing'])) {
1461 // XERROR: acknowledge the token's self-closing flag
1462 array_pop($this->stack);
1464 if ($this->mode !== self::IN_FOREIGN_CONTENT) {
1465 $this->secondary_mode = $this->mode;
1466 $this->mode = self::IN_FOREIGN_CONTENT;
1468 break;
1470 case 'svg':
1471 $this->reconstructActiveFormattingElements();
1472 $token = $this->adjustSVGAttributes($token);
1473 $token = $this->adjustForeignAttributes($token);
1474 $this->insertForeignElement($token, self::NS_SVG);
1475 if (isset($token['self-closing'])) {
1476 // XERROR: acknowledge the token's self-closing flag
1477 array_pop($this->stack);
1479 if ($this->mode !== self::IN_FOREIGN_CONTENT) {
1480 $this->secondary_mode = $this->mode;
1481 $this->mode = self::IN_FOREIGN_CONTENT;
1483 break;
1485 case 'caption': case 'col': case 'colgroup': case 'frame': case 'head':
1486 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': case 'tr':
1487 // parse error
1488 break;
1490 /* A start tag token not covered by the previous entries */
1491 default:
1492 /* Reconstruct the active formatting elements, if any. */
1493 $this->reconstructActiveFormattingElements();
1495 $this->insertElement($token);
1496 /* This element will be a phrasing element. */
1497 break;
1499 break;
1501 case HTML5_Tokenizer::ENDTAG:
1502 switch ($token['name']) {
1503 /* An end tag with the tag name "body" */
1504 case 'body':
1505 /* If the stack of open elements does not have a body
1506 * element in scope, this is a parse error; ignore the
1507 * token. */
1508 if (!$this->elementInScope('body')) {
1509 $this->ignored = true;
1511 /* Otherwise, if there is a node in the stack of open
1512 * elements that is not either a dc element, a dd element,
1513 * a ds element, a dt element, an li element, an optgroup
1514 * element, an option element, a p element, an rp element,
1515 * an rt element, a tbody element, a td element, a tfoot
1516 * element, a th element, a thead element, a tr element,
1517 * the body element, or the html element, then this is a
1518 * parse error.
1520 } else {
1521 // XERROR: implement this check for parse error
1524 /* Change the insertion mode to "after body". */
1525 $this->mode = self::AFTER_BODY;
1526 break;
1528 /* An end tag with the tag name "html" */
1529 case 'html':
1530 /* Act as if an end tag with tag name "body" had been seen,
1531 then, if that token wasn't ignored, reprocess the current
1532 token. */
1533 $this->emitToken(array(
1534 'name' => 'body',
1535 'type' => HTML5_Tokenizer::ENDTAG
1538 if (!$this->ignored) {
1539 $this->emitToken($token);
1541 break;
1543 case 'address': case 'article': case 'aside': case 'blockquote':
1544 case 'center': case 'datagrid': case 'details': case 'dir':
1545 case 'div': case 'dl': case 'fieldset': case 'footer':
1546 case 'header': case 'hgroup': case 'listing': case 'menu':
1547 case 'nav': case 'ol': case 'pre': case 'section': case 'ul':
1548 /* If the stack of open elements has an element in scope
1549 with the same tag name as that of the token, then generate
1550 implied end tags. */
1551 if ($this->elementInScope($token['name'])) {
1552 $this->generateImpliedEndTags();
1554 /* Now, if the current node is not an element with
1555 the same tag name as that of the token, then this
1556 is a parse error. */
1557 // XERROR: implement parse error logic
1559 /* If the stack of open elements has an element in
1560 scope with the same tag name as that of the token,
1561 then pop elements from this stack until an element
1562 with that tag name has been popped from the stack. */
1563 do {
1564 $node = array_pop($this->stack);
1565 } while ($node->tagName !== $token['name']);
1566 } else {
1567 // parse error
1569 break;
1571 /* An end tag whose tag name is "form" */
1572 case 'form':
1573 /* Let node be the element that the form element pointer is set to. */
1574 $node = $this->form_pointer;
1575 /* Set the form element pointer to null. */
1576 $this->form_pointer = null;
1577 /* If node is null or the stack of open elements does not
1578 * have node in scope, then this is a parse error; ignore the token. */
1579 if ($node === null || !in_array($node, $this->stack)) {
1580 // parse error
1581 $this->ignored = true;
1582 } else {
1583 /* 1. Generate implied end tags. */
1584 $this->generateImpliedEndTags();
1585 /* 2. If the current node is not node, then this is a parse error. */
1586 if (end($this->stack) !== $node) {
1587 // parse error
1589 /* 3. Remove node from the stack of open elements. */
1590 array_splice($this->stack, array_search($node, $this->stack, true), 1);
1593 break;
1595 /* An end tag whose tag name is "p" */
1596 case 'p':
1597 /* If the stack of open elements has a p element in scope,
1598 then generate implied end tags, except for p elements. */
1599 if ($this->elementInScope('p')) {
1600 /* Generate implied end tags, except for elements with
1601 * the same tag name as the token. */
1602 $this->generateImpliedEndTags(array('p'));
1604 /* If the current node is not a p element, then this is
1605 a parse error. */
1606 // XERROR: implement
1608 /* Pop elements from the stack of open elements until
1609 * an element with the same tag name as the token has
1610 * been popped from the stack. */
1611 do {
1612 $node = array_pop($this->stack);
1613 } while ($node->tagName !== 'p');
1615 } else {
1616 // parse error
1617 $this->emitToken(array(
1618 'name' => 'p',
1619 'type' => HTML5_Tokenizer::STARTTAG,
1621 $this->emitToken($token);
1623 break;
1625 /* An end tag whose tag name is "li" */
1626 case 'li':
1627 /* If the stack of open elements does not have an element
1628 * in list item scope with the same tag name as that of the
1629 * token, then this is a parse error; ignore the token. */
1630 if ($this->elementInScope($token['name'], self::SCOPE_LISTITEM)) {
1631 /* Generate implied end tags, except for elements with the
1632 * same tag name as the token. */
1633 $this->generateImpliedEndTags(array($token['name']));
1634 /* If the current node is not an element with the same tag
1635 * name as that of the token, then this is a parse error. */
1636 // XERROR: parse error
1637 /* Pop elements from the stack of open elements until an
1638 * element with the same tag name as the token has been
1639 * popped from the stack. */
1640 do {
1641 $node = array_pop($this->stack);
1642 } while ($node->tagName !== $token['name']);
1644 /*else {
1645 // XERROR: parse error
1647 break;
1649 /* An end tag whose tag name is "dc", "dd", "ds", "dt" */
1650 case 'dc': case 'dd': case 'ds': case 'dt':
1651 if ($this->elementInScope($token['name'])) {
1652 $this->generateImpliedEndTags(array($token['name']));
1654 /* If the current node is not an element with the same
1655 tag name as the token, then this is a parse error. */
1656 // XERROR: implement parse error
1658 /* Pop elements from the stack of open elements until
1659 * an element with the same tag name as the token has
1660 * been popped from the stack. */
1661 do {
1662 $node = array_pop($this->stack);
1663 } while ($node->tagName !== $token['name']);
1665 /*else {
1666 // XERROR: parse error
1668 break;
1670 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
1671 "h5", "h6" */
1672 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1673 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
1675 /* If the stack of open elements has in scope an element whose
1676 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1677 generate implied end tags. */
1678 if ($this->elementInScope($elements)) {
1679 $this->generateImpliedEndTags();
1681 /* Now, if the current node is not an element with the same
1682 tag name as that of the token, then this is a parse error. */
1683 // XERROR: implement parse error
1685 /* If the stack of open elements has in scope an element
1686 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
1687 "h6", then pop elements from the stack until an element
1688 with one of those tag names has been popped from the stack. */
1689 do {
1690 $node = array_pop($this->stack);
1691 } while (!in_array($node->tagName, $elements));
1693 /*else {
1694 // parse error
1696 break;
1698 /* An end tag whose tag name is one of: "a", "b", "big", "code", "em",
1699 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1700 case 'a': case 'b': case 'big': case 'code': case 'em': case 'font':
1701 case 'i': case 'nobr': case 's': case 'small': case 'strike':
1702 case 'strong': case 'tt': case 'u':
1703 // XERROR: generally speaking this needs parse error logic
1704 /* 1. Let the formatting element be the last element in
1705 the list of active formatting elements that:
1706 * is between the end of the list and the last scope
1707 marker in the list, if any, or the start of the list
1708 otherwise, and
1709 * has the same tag name as the token.
1711 while (true) {
1712 for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
1713 if ($this->a_formatting[$a] === self::MARKER) {
1714 break;
1715 } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
1716 $formatting_element = $this->a_formatting[$a];
1717 $in_stack = in_array($formatting_element, $this->stack, true);
1718 $fe_af_pos = $a;
1719 break;
1723 /* If there is no such node, or, if that node is
1724 also in the stack of open elements but the element
1725 is not in scope, then this is a parse error. Abort
1726 these steps. The token is ignored. */
1727 if (
1728 !isset($formatting_element) || (
1729 $in_stack &&
1730 !$this->elementInScope($token['name'])
1733 $this->ignored = true;
1734 break;
1736 /* Otherwise, if there is such a node, but that node
1737 is not in the stack of open elements, then this is a
1738 parse error; remove the element from the list, and
1739 abort these steps. */
1740 } elseif (isset($formatting_element) && !$in_stack) {
1741 unset($this->a_formatting[$fe_af_pos]);
1742 $this->a_formatting = array_merge($this->a_formatting);
1743 break;
1746 /* Otherwise, there is a formatting element and that
1747 * element is in the stack and is in scope. If the
1748 * element is not the current node, this is a parse
1749 * error. In any case, proceed with the algorithm as
1750 * written in the following steps. */
1751 // XERROR: implement me
1753 /* 2. Let the furthest block be the topmost node in the
1754 stack of open elements that is lower in the stack
1755 than the formatting element, and is not an element in
1756 the phrasing or formatting categories. There might
1757 not be one. */
1758 $fe_s_pos = array_search($formatting_element, $this->stack, true);
1759 $length = count($this->stack);
1761 for ($s = $fe_s_pos + 1; $s < $length; $s++) {
1762 $category = $this->getElementCategory($this->stack[$s]);
1764 if ($category !== self::PHRASING && $category !== self::FORMATTING) {
1765 $furthest_block = $this->stack[$s];
1766 break;
1770 /* 3. If there is no furthest block, then the UA must
1771 skip the subsequent steps and instead just pop all
1772 the nodes from the bottom of the stack of open
1773 elements, from the current node up to the formatting
1774 element, and remove the formatting element from the
1775 list of active formatting elements. */
1776 if (!isset($furthest_block)) {
1777 for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
1778 array_pop($this->stack);
1781 unset($this->a_formatting[$fe_af_pos]);
1782 $this->a_formatting = array_merge($this->a_formatting);
1783 break;
1786 /* 4. Let the common ancestor be the element
1787 immediately above the formatting element in the stack
1788 of open elements. */
1789 $common_ancestor = $this->stack[$fe_s_pos - 1];
1791 /* 5. Let a bookmark note the position of the
1792 formatting element in the list of active formatting
1793 elements relative to the elements on either side
1794 of it in the list. */
1795 $bookmark = $fe_af_pos;
1797 /* 6. Let node and last node be the furthest block.
1798 Follow these steps: */
1799 $node = $furthest_block;
1800 $last_node = $furthest_block;
1802 while (true) {
1803 for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
1804 /* 6.1 Let node be the element immediately
1805 prior to node in the stack of open elements. */
1806 $node = $this->stack[$n];
1808 /* 6.2 If node is not in the list of active
1809 formatting elements, then remove node from
1810 the stack of open elements and then go back
1811 to step 1. */
1812 if (!in_array($node, $this->a_formatting, true)) {
1813 array_splice($this->stack, $n, 1);
1814 } else {
1815 break;
1819 /* 6.3 Otherwise, if node is the formatting
1820 element, then go to the next step in the overall
1821 algorithm. */
1822 if ($node === $formatting_element) {
1823 break;
1825 /* 6.4 Otherwise, if last node is the furthest
1826 block, then move the aforementioned bookmark to
1827 be immediately after the node in the list of
1828 active formatting elements. */
1829 } elseif ($last_node === $furthest_block) {
1830 $bookmark = array_search($node, $this->a_formatting, true) + 1;
1833 /* 6.5 Create an element for the token for which
1834 * the element node was created, replace the entry
1835 * for node in the list of active formatting
1836 * elements with an entry for the new element,
1837 * replace the entry for node in the stack of open
1838 * elements with an entry for the new element, and
1839 * let node be the new element. */
1840 // we don't know what the token is anymore
1841 // XDOM
1842 $clone = $node->cloneNode();
1843 $a_pos = array_search($node, $this->a_formatting, true);
1844 $s_pos = array_search($node, $this->stack, true);
1845 $this->a_formatting[$a_pos] = $clone;
1846 $this->stack[$s_pos] = $clone;
1847 $node = $clone;
1849 /* 6.6 Insert last node into node, first removing
1850 it from its previous parent node if any. */
1851 // XDOM
1852 if ($last_node->parentNode !== null) {
1853 $last_node->parentNode->removeChild($last_node);
1856 // XDOM
1857 $node->appendChild($last_node);
1859 /* 6.7 Let last node be node. */
1860 $last_node = $node;
1862 /* 6.8 Return to step 1 of this inner set of steps. */
1865 /* 7. If the common ancestor node is a table, tbody,
1866 * tfoot, thead, or tr element, then, foster parent
1867 * whatever last node ended up being in the previous
1868 * step, first removing it from its previous parent
1869 * node if any. */
1870 // XDOM
1871 if ($last_node->parentNode) { // common step
1872 $last_node->parentNode->removeChild($last_node);
1874 if (in_array($common_ancestor->tagName, array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
1875 $this->fosterParent($last_node);
1876 /* Otherwise, append whatever last node ended up being
1877 * in the previous step to the common ancestor node,
1878 * first removing it from its previous parent node if
1879 * any. */
1880 } else {
1881 // XDOM
1882 $common_ancestor->appendChild($last_node);
1885 /* 8. Create an element for the token for which the
1886 * formatting element was created. */
1887 // XDOM
1888 $clone = $formatting_element->cloneNode();
1890 /* 9. Take all of the child nodes of the furthest
1891 block and append them to the element created in the
1892 last step. */
1893 // XDOM
1894 while ($furthest_block->hasChildNodes()) {
1895 $child = $furthest_block->firstChild;
1896 $furthest_block->removeChild($child);
1897 $clone->appendChild($child);
1900 /* 10. Append that clone to the furthest block. */
1901 // XDOM
1902 $furthest_block->appendChild($clone);
1904 /* 11. Remove the formatting element from the list
1905 of active formatting elements, and insert the new element
1906 into the list of active formatting elements at the
1907 position of the aforementioned bookmark. */
1908 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
1909 array_splice($this->a_formatting, $fe_af_pos, 1);
1911 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
1912 $af_part2 = array_slice($this->a_formatting, $bookmark);
1913 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
1915 /* 12. Remove the formatting element from the stack
1916 of open elements, and insert the new element into the stack
1917 of open elements immediately below the position of the
1918 furthest block in that stack. */
1919 $fe_s_pos = array_search($formatting_element, $this->stack, true);
1920 array_splice($this->stack, $fe_s_pos, 1);
1922 $fb_s_pos = array_search($furthest_block, $this->stack, true);
1923 $s_part1 = array_slice($this->stack, 0, $fb_s_pos + 1);
1924 $s_part2 = array_slice($this->stack, $fb_s_pos + 1);
1925 $this->stack = array_merge($s_part1, array($clone), $s_part2);
1927 /* 13. Jump back to step 1 in this series of steps. */
1928 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
1930 break;
1932 case 'applet': case 'button': case 'marquee': case 'object':
1933 /* If the stack of open elements has an element in scope whose
1934 tag name matches the tag name of the token, then generate implied
1935 tags. */
1936 if ($this->elementInScope($token['name'])) {
1937 $this->generateImpliedEndTags();
1939 /* Now, if the current node is not an element with the same
1940 tag name as the token, then this is a parse error. */
1941 // XERROR: implement logic
1943 /* Pop elements from the stack of open elements until
1944 * an element with the same tag name as the token has
1945 * been popped from the stack. */
1946 do {
1947 $node = array_pop($this->stack);
1948 } while ($node->tagName !== $token['name']);
1950 /* Clear the list of active formatting elements up to the
1951 * last marker. */
1952 $keys = array_keys($this->a_formatting, self::MARKER, true);
1953 $marker = end($keys);
1955 for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
1956 array_pop($this->a_formatting);
1959 /*else {
1960 // parse error
1962 break;
1964 case 'br':
1965 // Parse error
1966 $this->emitToken(array(
1967 'name' => 'br',
1968 'type' => HTML5_Tokenizer::STARTTAG,
1970 break;
1972 /* An end tag token not covered by the previous entries */
1973 default:
1974 for ($n = count($this->stack) - 1; $n >= 0; $n--) {
1975 /* Initialise node to be the current node (the bottommost
1976 node of the stack). */
1977 $node = $this->stack[$n];
1979 /* If node has the same tag name as the end tag token,
1980 then: */
1981 if ($token['name'] === $node->tagName) {
1982 /* Generate implied end tags. */
1983 $this->generateImpliedEndTags();
1985 /* If the tag name of the end tag token does not
1986 match the tag name of the current node, this is a
1987 parse error. */
1988 // XERROR: implement this
1990 /* Pop all the nodes from the current node up to
1991 node, including node, then stop these steps. */
1992 // XSKETCHY
1993 do {
1994 $pop = array_pop($this->stack);
1995 } while ($pop !== $node);
1996 break;
1997 } else {
1998 $category = $this->getElementCategory($node);
2000 if ($category !== self::FORMATTING && $category !== self::PHRASING) {
2001 /* Otherwise, if node is in neither the formatting
2002 category nor the phrasing category, then this is a
2003 parse error. Stop this algorithm. The end tag token
2004 is ignored. */
2005 $this->ignored = true;
2006 break;
2007 // parse error
2010 /* Set node to the previous entry in the stack of open elements. Loop. */
2012 break;
2014 break;
2016 break;
2018 case self::IN_CDATA_RCDATA:
2019 if (
2020 $token['type'] === HTML5_Tokenizer::CHARACTER ||
2021 $token['type'] === HTML5_Tokenizer::SPACECHARACTER
2023 $this->insertText($token['data']);
2024 } elseif ($token['type'] === HTML5_Tokenizer::EOF) {
2025 // parse error
2026 /* If the current node is a script element, mark the script
2027 * element as "already executed". */
2028 // probably not necessary
2029 array_pop($this->stack);
2030 $this->mode = $this->original_mode;
2031 $this->emitToken($token);
2032 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'script') {
2033 array_pop($this->stack);
2034 $this->mode = $this->original_mode;
2035 // we're ignoring all of the execution stuff
2036 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG) {
2037 array_pop($this->stack);
2038 $this->mode = $this->original_mode;
2040 break;
2042 case self::IN_TABLE:
2043 $clear = array('html', 'table');
2045 /* A character token */
2046 if ($token['type'] === HTML5_Tokenizer::CHARACTER ||
2047 $token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
2048 /* Let the pending table character tokens
2049 * be an empty list of tokens. */
2050 $this->pendingTableCharacters = "";
2051 $this->pendingTableCharactersDirty = false;
2052 /* Let the original insertion mode be the current
2053 * insertion mode. */
2054 $this->original_mode = $this->mode;
2055 /* Switch the insertion mode to
2056 * "in table text" and
2057 * reprocess the token. */
2058 $this->mode = self::IN_TABLE_TEXT;
2059 $this->emitToken($token);
2061 /* A comment token */
2062 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
2063 /* Append a Comment node to the current node with the data
2064 attribute set to the data given in the comment token. */
2065 $this->insertComment($token['data']);
2067 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
2068 // parse error
2070 /* A start tag whose tag name is "caption" */
2071 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2072 $token['name'] === 'caption') {
2073 /* Clear the stack back to a table context. */
2074 $this->clearStackToTableContext($clear);
2076 /* Insert a marker at the end of the list of active
2077 formatting elements. */
2078 $this->a_formatting[] = self::MARKER;
2080 /* Insert an HTML element for the token, then switch the
2081 insertion mode to "in caption". */
2082 $this->insertElement($token);
2083 $this->mode = self::IN_CAPTION;
2085 /* A start tag whose tag name is "colgroup" */
2086 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2087 $token['name'] === 'colgroup') {
2088 /* Clear the stack back to a table context. */
2089 $this->clearStackToTableContext($clear);
2091 /* Insert an HTML element for the token, then switch the
2092 insertion mode to "in column group". */
2093 $this->insertElement($token);
2094 $this->mode = self::IN_COLUMN_GROUP;
2096 /* A start tag whose tag name is "col" */
2097 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2098 $token['name'] === 'col') {
2099 $this->emitToken(array(
2100 'name' => 'colgroup',
2101 'type' => HTML5_Tokenizer::STARTTAG,
2102 'attr' => array()
2105 $this->emitToken($token);
2107 /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
2108 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
2109 array('tbody', 'tfoot', 'thead'))) {
2110 /* Clear the stack back to a table context. */
2111 $this->clearStackToTableContext($clear);
2113 /* Insert an HTML element for the token, then switch the insertion
2114 mode to "in table body". */
2115 $this->insertElement($token);
2116 $this->mode = self::IN_TABLE_BODY;
2118 /* A start tag whose tag name is one of: "td", "th", "tr" */
2119 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2120 in_array($token['name'], array('td', 'th', 'tr'))) {
2121 /* Act as if a start tag token with the tag name "tbody" had been
2122 seen, then reprocess the current token. */
2123 $this->emitToken(array(
2124 'name' => 'tbody',
2125 'type' => HTML5_Tokenizer::STARTTAG,
2126 'attr' => array()
2129 $this->emitToken($token);
2131 /* A start tag whose tag name is "table" */
2132 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2133 $token['name'] === 'table') {
2134 /* Parse error. Act as if an end tag token with the tag name "table"
2135 had been seen, then, if that token wasn't ignored, reprocess the
2136 current token. */
2137 $this->emitToken(array(
2138 'name' => 'table',
2139 'type' => HTML5_Tokenizer::ENDTAG
2142 if (!$this->ignored) {
2143 $this->emitToken($token);
2146 /* An end tag whose tag name is "table" */
2147 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2148 $token['name'] === 'table') {
2149 /* If the stack of open elements does not have an element in table
2150 scope with the same tag name as the token, this is a parse error.
2151 Ignore the token. (fragment case) */
2152 if (!$this->elementInScope($token['name'], self::SCOPE_TABLE)) {
2153 $this->ignored = true;
2154 } else {
2155 do {
2156 $node = array_pop($this->stack);
2157 } while ($node->tagName !== 'table');
2159 /* Reset the insertion mode appropriately. */
2160 $this->resetInsertionMode();
2163 /* An end tag whose tag name is one of: "body", "caption", "col",
2164 "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
2165 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
2166 array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
2167 'tfoot', 'th', 'thead', 'tr'))) {
2168 // Parse error. Ignore the token.
2170 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2171 ($token['name'] === 'style' || $token['name'] === 'script')) {
2172 $this->processWithRulesFor($token, self::IN_HEAD);
2174 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'input' &&
2175 // assignment is intentional
2176 /* If the token does not have an attribute with the name "type", or
2177 * if it does, but that attribute's value is not an ASCII
2178 * case-insensitive match for the string "hidden", then: act as
2179 * described in the "anything else" entry below. */
2180 ($type = $this->getAttr($token, 'type')) && strtolower($type) === 'hidden') {
2181 // I.e., if it's an input with the type attribute == 'hidden'
2182 /* Otherwise */
2183 // parse error
2184 $this->insertElement($token);
2185 array_pop($this->stack);
2186 } elseif ($token['type'] === HTML5_Tokenizer::EOF) {
2187 /* If the current node is not the root html element, then this is a parse error. */
2188 if (end($this->stack)->tagName !== 'html') {
2189 // Note: It can only be the current node in the fragment case.
2190 // parse error
2192 /* Stop parsing. */
2193 /* Anything else */
2194 } else {
2195 /* Parse error. Process the token as if the insertion mode was "in
2196 body", with the following exception: */
2198 $old = $this->foster_parent;
2199 $this->foster_parent = true;
2200 $this->processWithRulesFor($token, self::IN_BODY);
2201 $this->foster_parent = $old;
2203 break;
2205 case self::IN_TABLE_TEXT:
2206 /* A character token */
2207 if ($token['type'] === HTML5_Tokenizer::CHARACTER) {
2208 /* Append the character token to the pending table
2209 * character tokens list. */
2210 $this->pendingTableCharacters .= $token['data'];
2211 $this->pendingTableCharactersDirty = true;
2212 } elseif ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
2213 $this->pendingTableCharacters .= $token['data'];
2214 /* Anything else */
2215 } else {
2216 if ($this->pendingTableCharacters !== '' && is_string($this->pendingTableCharacters)) {
2217 /* If any of the tokens in the pending table character tokens list
2218 * are character tokens that are not one of U+0009 CHARACTER
2219 * TABULATION, U+000A LINE FEED (LF), U+000C FORM FEED (FF), or
2220 * U+0020 SPACE, then reprocess those character tokens using the
2221 * rules given in the "anything else" entry in the "in table"
2222 * insertion mode.*/
2223 if ($this->pendingTableCharactersDirty) {
2224 /* Parse error. Process the token using the rules for the
2225 * "in body" insertion mode, except that if the current
2226 * node is a table, tbody, tfoot, thead, or tr element,
2227 * then, whenever a node would be inserted into the current
2228 * node, it must instead be foster parented. */
2229 // XERROR
2230 $old = $this->foster_parent;
2231 $this->foster_parent = true;
2232 $text_token = array(
2233 'type' => HTML5_Tokenizer::CHARACTER,
2234 'data' => $this->pendingTableCharacters,
2236 $this->processWithRulesFor($text_token, self::IN_BODY);
2237 $this->foster_parent = $old;
2239 /* Otherwise, insert the characters given by the pending table
2240 * character tokens list into the current node. */
2241 } else {
2242 $this->insertText($this->pendingTableCharacters);
2244 $this->pendingTableCharacters = null;
2245 $this->pendingTableCharactersNull = null;
2248 /* Switch the insertion mode to the original insertion mode and
2249 * reprocess the token.
2251 $this->mode = $this->original_mode;
2252 $this->emitToken($token);
2254 break;
2256 case self::IN_CAPTION:
2257 /* An end tag whose tag name is "caption" */
2258 if ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'caption') {
2259 /* If the stack of open elements does not have an element in table
2260 scope with the same tag name as the token, this is a parse error.
2261 Ignore the token. (fragment case) */
2262 if (!$this->elementInScope($token['name'], self::SCOPE_TABLE)) {
2263 $this->ignored = true;
2264 // Ignore
2266 /* Otherwise: */
2267 } else {
2268 /* Generate implied end tags. */
2269 $this->generateImpliedEndTags();
2271 /* Now, if the current node is not a caption element, then this
2272 is a parse error. */
2273 // XERROR: implement
2275 /* Pop elements from this stack until a caption element has
2276 been popped from the stack. */
2277 do {
2278 $node = array_pop($this->stack);
2279 } while ($node->tagName !== 'caption');
2281 /* Clear the list of active formatting elements up to the last
2282 marker. */
2283 $this->clearTheActiveFormattingElementsUpToTheLastMarker();
2285 /* Switch the insertion mode to "in table". */
2286 $this->mode = self::IN_TABLE;
2289 /* A start tag whose tag name is one of: "caption", "col", "colgroup",
2290 "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
2291 name is "table" */
2292 } elseif (($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
2293 array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
2294 'thead', 'tr'))) || ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2295 $token['name'] === 'table')) {
2296 /* Parse error. Act as if an end tag with the tag name "caption"
2297 had been seen, then, if that token wasn't ignored, reprocess the
2298 current token. */
2299 $this->emitToken(array(
2300 'name' => 'caption',
2301 'type' => HTML5_Tokenizer::ENDTAG
2304 if (!$this->ignored) {
2305 $this->emitToken($token);
2308 /* An end tag whose tag name is one of: "body", "col", "colgroup",
2309 "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
2310 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
2311 array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th',
2312 'thead', 'tr'))) {
2313 // Parse error. Ignore the token.
2314 $this->ignored = true;
2315 } else {
2316 /* Process the token as if the insertion mode was "in body". */
2317 $this->processWithRulesFor($token, self::IN_BODY);
2319 break;
2321 case self::IN_COLUMN_GROUP:
2322 /* A character token that is one of U+0009 CHARACTER TABULATION,
2323 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2324 or U+0020 SPACE */
2325 if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
2326 /* Append the character to the current node. */
2327 $this->insertText($token['data']);
2329 /* A comment token */
2330 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
2331 /* Append a Comment node to the current node with the data
2332 attribute set to the data given in the comment token. */
2333 $this->insertComment($token['data']);
2334 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
2335 // parse error
2336 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
2337 $this->processWithRulesFor($token, self::IN_BODY);
2339 /* A start tag whose tag name is "col" */
2340 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'col') {
2341 /* Insert a col element for the token. Immediately pop the current
2342 node off the stack of open elements. */
2343 $this->insertElement($token);
2344 array_pop($this->stack);
2345 // XERROR: Acknowledge the token's self-closing flag, if it is set.
2347 /* An end tag whose tag name is "colgroup" */
2348 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2349 $token['name'] === 'colgroup') {
2350 /* If the current node is the root html element, then this is a
2351 parse error, ignore the token. (fragment case) */
2352 if (end($this->stack)->tagName === 'html') {
2353 $this->ignored = true;
2355 /* Otherwise, pop the current node (which will be a colgroup
2356 element) from the stack of open elements. Switch the insertion
2357 mode to "in table". */
2358 } else {
2359 array_pop($this->stack);
2360 $this->mode = self::IN_TABLE;
2363 /* An end tag whose tag name is "col" */
2364 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'col') {
2365 /* Parse error. Ignore the token. */
2366 $this->ignored = true;
2368 /* An end-of-file token */
2369 /* If the current node is the root html element */
2370 } elseif ($token['type'] === HTML5_Tokenizer::EOF && end($this->stack)->tagName === 'html') {
2371 /* Stop parsing */
2373 /* Anything else */
2374 } else {
2375 /* Act as if an end tag with the tag name "colgroup" had been seen,
2376 and then, if that token wasn't ignored, reprocess the current token. */
2377 $this->emitToken(array(
2378 'name' => 'colgroup',
2379 'type' => HTML5_Tokenizer::ENDTAG
2382 if (!$this->ignored) {
2383 $this->emitToken($token);
2386 break;
2388 case self::IN_TABLE_BODY:
2389 $clear = array('tbody', 'tfoot', 'thead', 'html');
2391 /* A start tag whose tag name is "tr" */
2392 if ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'tr') {
2393 /* Clear the stack back to a table body context. */
2394 $this->clearStackToTableContext($clear);
2396 /* Insert a tr element for the token, then switch the insertion
2397 mode to "in row". */
2398 $this->insertElement($token);
2399 $this->mode = self::IN_ROW;
2401 /* A start tag whose tag name is one of: "th", "td" */
2402 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2403 ($token['name'] === 'th' || $token['name'] === 'td')) {
2404 /* Parse error. Act as if a start tag with the tag name "tr" had
2405 been seen, then reprocess the current token. */
2406 $this->emitToken(array(
2407 'name' => 'tr',
2408 'type' => HTML5_Tokenizer::STARTTAG,
2409 'attr' => array()
2412 $this->emitToken($token);
2414 /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
2415 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2416 in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
2417 /* If the stack of open elements does not have an element in table
2418 scope with the same tag name as the token, this is a parse error.
2419 Ignore the token. */
2420 if (!$this->elementInScope($token['name'], self::SCOPE_TABLE)) {
2421 // Parse error
2422 $this->ignored = true;
2424 /* Otherwise: */
2425 } else {
2426 /* Clear the stack back to a table body context. */
2427 $this->clearStackToTableContext($clear);
2429 /* Pop the current node from the stack of open elements. Switch
2430 the insertion mode to "in table". */
2431 array_pop($this->stack);
2432 $this->mode = self::IN_TABLE;
2435 /* A start tag whose tag name is one of: "caption", "col", "colgroup",
2436 "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
2437 } elseif (($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
2438 array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
2439 ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) {
2440 /* If the stack of open elements does not have a tbody, thead, or
2441 tfoot element in table scope, this is a parse error. Ignore the
2442 token. (fragment case) */
2443 if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), self::SCOPE_TABLE)) {
2444 // parse error
2445 $this->ignored = true;
2447 /* Otherwise: */
2448 } else {
2449 /* Clear the stack back to a table body context. */
2450 $this->clearStackToTableContext($clear);
2452 /* Act as if an end tag with the same tag name as the current
2453 node ("tbody", "tfoot", or "thead") had been seen, then
2454 reprocess the current token. */
2455 $this->emitToken(array(
2456 'name' => end($this->stack)->tagName,
2457 'type' => HTML5_Tokenizer::ENDTAG
2460 $this->emitToken($token);
2463 /* An end tag whose tag name is one of: "body", "caption", "col",
2464 "colgroup", "html", "td", "th", "tr" */
2465 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
2466 array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
2467 /* Parse error. Ignore the token. */
2468 $this->ignored = true;
2470 /* Anything else */
2471 } else {
2472 /* Process the token as if the insertion mode was "in table". */
2473 $this->processWithRulesFor($token, self::IN_TABLE);
2475 break;
2477 case self::IN_ROW:
2478 $clear = array('tr', 'html');
2480 /* A start tag whose tag name is one of: "th", "td" */
2481 if ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2482 ($token['name'] === 'th' || $token['name'] === 'td')) {
2483 /* Clear the stack back to a table row context. */
2484 $this->clearStackToTableContext($clear);
2486 /* Insert an HTML element for the token, then switch the insertion
2487 mode to "in cell". */
2488 $this->insertElement($token);
2489 $this->mode = self::IN_CELL;
2491 /* Insert a marker at the end of the list of active formatting
2492 elements. */
2493 $this->a_formatting[] = self::MARKER;
2495 /* An end tag whose tag name is "tr" */
2496 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'tr') {
2497 /* If the stack of open elements does not have an element in table
2498 scope with the same tag name as the token, this is a parse error.
2499 Ignore the token. (fragment case) */
2500 if (!$this->elementInScope($token['name'], self::SCOPE_TABLE)) {
2501 // Ignore.
2502 $this->ignored = true;
2503 } else {
2504 /* Clear the stack back to a table row context. */
2505 $this->clearStackToTableContext($clear);
2507 /* Pop the current node (which will be a tr element) from the
2508 stack of open elements. Switch the insertion mode to "in table
2509 body". */
2510 array_pop($this->stack);
2511 $this->mode = self::IN_TABLE_BODY;
2514 /* A start tag whose tag name is one of: "caption", "col", "colgroup",
2515 "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
2516 } elseif (($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
2517 array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
2518 ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'table')) {
2519 /* Act as if an end tag with the tag name "tr" had been seen, then,
2520 if that token wasn't ignored, reprocess the current token. */
2521 $this->emitToken(array(
2522 'name' => 'tr',
2523 'type' => HTML5_Tokenizer::ENDTAG
2525 if (!$this->ignored) {
2526 $this->emitToken($token);
2529 /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
2530 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2531 in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
2532 /* If the stack of open elements does not have an element in table
2533 scope with the same tag name as the token, this is a parse error.
2534 Ignore the token. */
2535 if (!$this->elementInScope($token['name'], self::SCOPE_TABLE)) {
2536 $this->ignored = true;
2538 /* Otherwise: */
2539 } else {
2540 /* Otherwise, act as if an end tag with the tag name "tr" had
2541 been seen, then reprocess the current token. */
2542 $this->emitToken(array(
2543 'name' => 'tr',
2544 'type' => HTML5_Tokenizer::ENDTAG
2547 $this->emitToken($token);
2550 /* An end tag whose tag name is one of: "body", "caption", "col",
2551 "colgroup", "html", "td", "th" */
2552 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
2553 array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
2554 /* Parse error. Ignore the token. */
2555 $this->ignored = true;
2557 /* Anything else */
2558 } else {
2559 /* Process the token as if the insertion mode was "in table". */
2560 $this->processWithRulesFor($token, self::IN_TABLE);
2562 break;
2564 case self::IN_CELL:
2565 /* An end tag whose tag name is one of: "td", "th" */
2566 if ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2567 ($token['name'] === 'td' || $token['name'] === 'th')) {
2568 /* If the stack of open elements does not have an element in table
2569 scope with the same tag name as that of the token, then this is a
2570 parse error and the token must be ignored. */
2571 if (!$this->elementInScope($token['name'], self::SCOPE_TABLE)) {
2572 $this->ignored = true;
2574 /* Otherwise: */
2575 } else {
2576 /* Generate implied end tags, except for elements with the same
2577 tag name as the token. */
2578 $this->generateImpliedEndTags(array($token['name']));
2580 /* Now, if the current node is not an element with the same tag
2581 name as the token, then this is a parse error. */
2582 // XERROR: Implement parse error code
2584 /* Pop elements from this stack until an element with the same
2585 tag name as the token has been popped from the stack. */
2586 do {
2587 $node = array_pop($this->stack);
2588 } while ($node->tagName !== $token['name']);
2590 /* Clear the list of active formatting elements up to the last
2591 marker. */
2592 $this->clearTheActiveFormattingElementsUpToTheLastMarker();
2594 /* Switch the insertion mode to "in row". (The current node
2595 will be a tr element at this point.) */
2596 $this->mode = self::IN_ROW;
2599 /* A start tag whose tag name is one of: "caption", "col", "colgroup",
2600 "tbody", "td", "tfoot", "th", "thead", "tr" */
2601 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && in_array($token['name'],
2602 array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
2603 'thead', 'tr'))) {
2604 /* If the stack of open elements does not have a td or th element
2605 in table scope, then this is a parse error; ignore the token.
2606 (fragment case) */
2607 if (!$this->elementInScope(array('td', 'th'), self::SCOPE_TABLE)) {
2608 // parse error
2609 $this->ignored = true;
2611 /* Otherwise, close the cell (see below) and reprocess the current
2612 token. */
2613 } else {
2614 $this->closeCell();
2615 $this->emitToken($token);
2618 /* An end tag whose tag name is one of: "body", "caption", "col",
2619 "colgroup", "html" */
2620 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
2621 array('body', 'caption', 'col', 'colgroup', 'html'))) {
2622 /* Parse error. Ignore the token. */
2623 $this->ignored = true;
2625 /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
2626 "thead", "tr" */
2627 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && in_array($token['name'],
2628 array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
2629 /* If the stack of open elements does not have a td or th element
2630 in table scope, then this is a parse error; ignore the token.
2631 (innerHTML case) */
2632 if (!$this->elementInScope(array('td', 'th'), self::SCOPE_TABLE)) {
2633 // Parse error
2634 $this->ignored = true;
2636 /* Otherwise, close the cell (see below) and reprocess the current
2637 token. */
2638 } else {
2639 $this->closeCell();
2640 $this->emitToken($token);
2643 /* Anything else */
2644 } else {
2645 /* Process the token as if the insertion mode was "in body". */
2646 $this->processWithRulesFor($token, self::IN_BODY);
2648 break;
2650 case self::IN_SELECT:
2651 /* Handle the token as follows: */
2653 /* A character token */
2654 if (
2655 $token['type'] === HTML5_Tokenizer::CHARACTER ||
2656 $token['type'] === HTML5_Tokenizer::SPACECHARACTER
2658 /* Append the token's character to the current node. */
2659 $this->insertText($token['data']);
2661 /* A comment token */
2662 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
2663 /* Append a Comment node to the current node with the data
2664 attribute set to the data given in the comment token. */
2665 $this->insertComment($token['data']);
2667 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
2668 // parse error
2670 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
2671 $this->processWithRulesFor($token, self::IN_BODY);
2673 /* A start tag token whose tag name is "option" */
2674 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2675 $token['name'] === 'option') {
2676 /* If the current node is an option element, act as if an end tag
2677 with the tag name "option" had been seen. */
2678 if (end($this->stack)->tagName === 'option') {
2679 $this->emitToken(array(
2680 'name' => 'option',
2681 'type' => HTML5_Tokenizer::ENDTAG
2685 /* Insert an HTML element for the token. */
2686 $this->insertElement($token);
2688 /* A start tag token whose tag name is "optgroup" */
2689 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2690 $token['name'] === 'optgroup') {
2691 /* If the current node is an option element, act as if an end tag
2692 with the tag name "option" had been seen. */
2693 if (end($this->stack)->tagName === 'option') {
2694 $this->emitToken(array(
2695 'name' => 'option',
2696 'type' => HTML5_Tokenizer::ENDTAG
2700 /* If the current node is an optgroup element, act as if an end tag
2701 with the tag name "optgroup" had been seen. */
2702 if (end($this->stack)->tagName === 'optgroup') {
2703 $this->emitToken(array(
2704 'name' => 'optgroup',
2705 'type' => HTML5_Tokenizer::ENDTAG
2709 /* Insert an HTML element for the token. */
2710 $this->insertElement($token);
2712 /* An end tag token whose tag name is "optgroup" */
2713 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2714 $token['name'] === 'optgroup') {
2715 /* First, if the current node is an option element, and the node
2716 immediately before it in the stack of open elements is an optgroup
2717 element, then act as if an end tag with the tag name "option" had
2718 been seen. */
2719 $elements_in_stack = count($this->stack);
2721 if ($this->stack[$elements_in_stack - 1]->tagName === 'option' &&
2722 $this->stack[$elements_in_stack - 2]->tagName === 'optgroup') {
2723 $this->emitToken(array(
2724 'name' => 'option',
2725 'type' => HTML5_Tokenizer::ENDTAG
2729 /* If the current node is an optgroup element, then pop that node
2730 from the stack of open elements. Otherwise, this is a parse error,
2731 ignore the token. */
2732 if (end($this->stack)->tagName === 'optgroup') {
2733 array_pop($this->stack);
2734 } else {
2735 // parse error
2736 $this->ignored = true;
2739 /* An end tag token whose tag name is "option" */
2740 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2741 $token['name'] === 'option') {
2742 /* If the current node is an option element, then pop that node
2743 from the stack of open elements. Otherwise, this is a parse error,
2744 ignore the token. */
2745 if (end($this->stack)->tagName === 'option') {
2746 array_pop($this->stack);
2747 } else {
2748 // parse error
2749 $this->ignored = true;
2752 /* An end tag whose tag name is "select" */
2753 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2754 $token['name'] === 'select') {
2755 /* If the stack of open elements does not have an element in table
2756 scope with the same tag name as the token, this is a parse error.
2757 Ignore the token. (fragment case) */
2758 if (!$this->elementInScope($token['name'], self::SCOPE_TABLE)) {
2759 $this->ignored = true;
2760 // parse error
2762 /* Otherwise: */
2763 } else {
2764 /* Pop elements from the stack of open elements until a select
2765 element has been popped from the stack. */
2766 do {
2767 $node = array_pop($this->stack);
2768 } while ($node->tagName !== 'select');
2770 /* Reset the insertion mode appropriately. */
2771 $this->resetInsertionMode();
2774 /* A start tag whose tag name is "select" */
2775 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'select') {
2776 /* Parse error. Act as if the token had been an end tag with the
2777 tag name "select" instead. */
2778 $this->emitToken(array(
2779 'name' => 'select',
2780 'type' => HTML5_Tokenizer::ENDTAG
2783 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2784 ($token['name'] === 'input' || $token['name'] === 'keygen' || $token['name'] === 'textarea')) {
2785 // parse error
2786 $this->emitToken(array(
2787 'name' => 'select',
2788 'type' => HTML5_Tokenizer::ENDTAG
2790 $this->emitToken($token);
2792 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'script') {
2793 $this->processWithRulesFor($token, self::IN_HEAD);
2795 } elseif ($token['type'] === HTML5_Tokenizer::EOF) {
2796 // XERROR: If the current node is not the root html element, then this is a parse error.
2797 /* Stop parsing */
2799 /* Anything else */
2800 } else {
2801 /* Parse error. Ignore the token. */
2802 $this->ignored = true;
2804 break;
2806 case self::IN_SELECT_IN_TABLE:
2808 if ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2809 in_array($token['name'], array('caption', 'table', 'tbody',
2810 'tfoot', 'thead', 'tr', 'td', 'th'))) {
2811 // parse error
2812 $this->emitToken(array(
2813 'name' => 'select',
2814 'type' => HTML5_Tokenizer::ENDTAG,
2816 $this->emitToken($token);
2818 /* An end tag whose tag name is one of: "caption", "table", "tbody",
2819 "tfoot", "thead", "tr", "td", "th" */
2820 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2821 in_array($token['name'], array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
2822 /* Parse error. */
2823 // parse error
2825 /* If the stack of open elements has an element in table scope with
2826 the same tag name as that of the token, then act as if an end tag
2827 with the tag name "select" had been seen, and reprocess the token.
2828 Otherwise, ignore the token. */
2829 if ($this->elementInScope($token['name'], self::SCOPE_TABLE)) {
2830 $this->emitToken(array(
2831 'name' => 'select',
2832 'type' => HTML5_Tokenizer::ENDTAG
2835 $this->emitToken($token);
2836 } else {
2837 $this->ignored = true;
2839 } else {
2840 $this->processWithRulesFor($token, self::IN_SELECT);
2842 break;
2844 case self::IN_FOREIGN_CONTENT:
2845 if ($token['type'] === HTML5_Tokenizer::CHARACTER ||
2846 $token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
2847 $this->insertText($token['data']);
2848 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
2849 $this->insertComment($token['data']);
2850 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
2851 // XERROR: parse error
2852 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
2853 $token['name'] === 'script' && end($this->stack)->tagName === 'script' &&
2854 // XDOM
2855 end($this->stack)->namespaceURI === self::NS_SVG) {
2856 array_pop($this->stack);
2857 // a bunch of script running mumbo jumbo
2858 } elseif (
2859 ($token['type'] === HTML5_Tokenizer::STARTTAG &&
2861 $token['name'] !== 'mglyph' &&
2862 $token['name'] !== 'malignmark' &&
2863 // XDOM
2864 end($this->stack)->namespaceURI === self::NS_MATHML &&
2865 in_array(end($this->stack)->tagName, array('mi', 'mo', 'mn', 'ms', 'mtext'))
2866 ) ||
2868 $token['name'] === 'svg' &&
2869 // XDOM
2870 end($this->stack)->namespaceURI === self::NS_MATHML &&
2871 end($this->stack)->tagName === 'annotation-xml'
2872 ) ||
2874 // XDOM
2875 end($this->stack)->namespaceURI === self::NS_SVG &&
2876 in_array(end($this->stack)->tagName, array('foreignObject', 'desc', 'title'))
2877 ) ||
2879 // XSKETCHY && XDOM
2880 end($this->stack)->namespaceURI === self::NS_HTML
2882 ) || $token['type'] === HTML5_Tokenizer::ENDTAG
2884 $this->processWithRulesFor($token, $this->secondary_mode);
2885 /* If, after doing so, the insertion mode is still "in foreign
2886 * content", but there is no element in scope that has a namespace
2887 * other than the HTML namespace, switch the insertion mode to the
2888 * secondary insertion mode. */
2889 if ($this->mode === self::IN_FOREIGN_CONTENT) {
2890 $found = false;
2891 // this basically duplicates elementInScope()
2892 for ($i = count($this->stack) - 1; $i >= 0; $i--) {
2893 // XDOM
2894 $node = $this->stack[$i];
2895 if ($node->namespaceURI !== self::NS_HTML) {
2896 $found = true;
2897 break;
2898 } elseif (in_array($node->tagName, array('table', 'html',
2899 'applet', 'caption', 'td', 'th', 'button', 'marquee',
2900 'object')) || ($node->tagName === 'foreignObject' &&
2901 $node->namespaceURI === self::NS_SVG)) {
2902 break;
2905 if (!$found) {
2906 $this->mode = $this->secondary_mode;
2909 } elseif ($token['type'] === HTML5_Tokenizer::EOF || (
2910 $token['type'] === HTML5_Tokenizer::STARTTAG &&
2911 (in_array($token['name'], array('b', "big", "blockquote", "body", "br",
2912 "center", "code", "dc", "dd", "div", "dl", "ds", "dt", "em", "embed", "h1", "h2",
2913 "h3", "h4", "h5", "h6", "head", "hr", "i", "img", "li", "listing",
2914 "menu", "meta", "nobr", "ol", "p", "pre", "ruby", "s", "small",
2915 "span", "strong", "strike", "sub", "sup", "table", "tt", "u", "ul",
2916 "var")) || ($token['name'] === 'font' && ($this->getAttr($token, 'color') ||
2917 $this->getAttr($token, 'face') || $this->getAttr($token, 'size')))))) {
2918 // XERROR: parse error
2919 do {
2920 $node = array_pop($this->stack);
2921 // XDOM
2922 } while ($node->namespaceURI !== self::NS_HTML);
2923 $this->stack[] = $node;
2924 $this->mode = $this->secondary_mode;
2925 $this->emitToken($token);
2926 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG) {
2927 static $svg_lookup = array(
2928 'altglyph' => 'altGlyph',
2929 'altglyphdef' => 'altGlyphDef',
2930 'altglyphitem' => 'altGlyphItem',
2931 'animatecolor' => 'animateColor',
2932 'animatemotion' => 'animateMotion',
2933 'animatetransform' => 'animateTransform',
2934 'clippath' => 'clipPath',
2935 'feblend' => 'feBlend',
2936 'fecolormatrix' => 'feColorMatrix',
2937 'fecomponenttransfer' => 'feComponentTransfer',
2938 'fecomposite' => 'feComposite',
2939 'feconvolvematrix' => 'feConvolveMatrix',
2940 'fediffuselighting' => 'feDiffuseLighting',
2941 'fedisplacementmap' => 'feDisplacementMap',
2942 'fedistantlight' => 'feDistantLight',
2943 'feflood' => 'feFlood',
2944 'fefunca' => 'feFuncA',
2945 'fefuncb' => 'feFuncB',
2946 'fefuncg' => 'feFuncG',
2947 'fefuncr' => 'feFuncR',
2948 'fegaussianblur' => 'feGaussianBlur',
2949 'feimage' => 'feImage',
2950 'femerge' => 'feMerge',
2951 'femergenode' => 'feMergeNode',
2952 'femorphology' => 'feMorphology',
2953 'feoffset' => 'feOffset',
2954 'fepointlight' => 'fePointLight',
2955 'fespecularlighting' => 'feSpecularLighting',
2956 'fespotlight' => 'feSpotLight',
2957 'fetile' => 'feTile',
2958 'feturbulence' => 'feTurbulence',
2959 'foreignobject' => 'foreignObject',
2960 'glyphref' => 'glyphRef',
2961 'lineargradient' => 'linearGradient',
2962 'radialgradient' => 'radialGradient',
2963 'textpath' => 'textPath',
2965 // XDOM
2966 $current = end($this->stack);
2967 if ($current->namespaceURI === self::NS_MATHML) {
2968 $token = $this->adjustMathMLAttributes($token);
2970 if ($current->namespaceURI === self::NS_SVG &&
2971 isset($svg_lookup[$token['name']])) {
2972 $token['name'] = $svg_lookup[$token['name']];
2974 if ($current->namespaceURI === self::NS_SVG) {
2975 $token = $this->adjustSVGAttributes($token);
2977 $token = $this->adjustForeignAttributes($token);
2978 $this->insertForeignElement($token, $current->namespaceURI);
2979 if (isset($token['self-closing'])) {
2980 array_pop($this->stack);
2981 // XERROR: acknowledge self-closing flag
2984 break;
2986 case self::AFTER_BODY:
2987 /* Handle the token as follows: */
2989 /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2990 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2991 or U+0020 SPACE */
2992 if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
2993 /* Process the token as it would be processed if the insertion mode
2994 was "in body". */
2995 $this->processWithRulesFor($token, self::IN_BODY);
2997 /* A comment token */
2998 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
2999 /* Append a Comment node to the first element in the stack of open
3000 elements (the html element), with the data attribute set to the
3001 data given in the comment token. */
3002 // XDOM
3003 $comment = $this->dom->createComment($token['data']);
3004 $this->stack[0]->appendChild($comment);
3006 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
3007 // parse error
3009 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
3010 $this->processWithRulesFor($token, self::IN_BODY);
3012 /* An end tag with the tag name "html" */
3013 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG && $token['name'] === 'html') {
3014 /* If the parser was originally created as part of the HTML
3015 * fragment parsing algorithm, this is a parse error; ignore
3016 * the token. (fragment case) */
3017 $this->ignored = true;
3018 // XERROR: implement this
3020 $this->mode = self::AFTER_AFTER_BODY;
3022 } elseif ($token['type'] === HTML5_Tokenizer::EOF) {
3023 /* Stop parsing */
3025 /* Anything else */
3026 } else {
3027 /* Parse error. Set the insertion mode to "in body" and reprocess
3028 the token. */
3029 $this->mode = self::IN_BODY;
3030 $this->emitToken($token);
3032 break;
3034 case self::IN_FRAMESET:
3035 /* Handle the token as follows: */
3037 /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3038 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3039 U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
3040 if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
3041 /* Append the character to the current node. */
3042 $this->insertText($token['data']);
3044 /* A comment token */
3045 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
3046 /* Append a Comment node to the current node with the data
3047 attribute set to the data given in the comment token. */
3048 $this->insertComment($token['data']);
3050 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
3051 // parse error
3053 /* A start tag with the tag name "frameset" */
3054 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
3055 $token['name'] === 'frameset') {
3056 $this->insertElement($token);
3058 /* An end tag with the tag name "frameset" */
3059 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
3060 $token['name'] === 'frameset') {
3061 /* If the current node is the root html element, then this is a
3062 parse error; ignore the token. (fragment case) */
3063 if (end($this->stack)->tagName === 'html') {
3064 $this->ignored = true;
3065 // Parse error
3067 } else {
3068 /* Otherwise, pop the current node from the stack of open
3069 elements. */
3070 array_pop($this->stack);
3072 /* If the parser was not originally created as part of the HTML
3073 * fragment parsing algorithm (fragment case), and the current
3074 * node is no longer a frameset element, then switch the
3075 * insertion mode to "after frameset". */
3076 $this->mode = self::AFTER_FRAMESET;
3079 /* A start tag with the tag name "frame" */
3080 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
3081 $token['name'] === 'frame') {
3082 /* Insert an HTML element for the token. */
3083 $this->insertElement($token);
3085 /* Immediately pop the current node off the stack of open elements. */
3086 array_pop($this->stack);
3088 // XERROR: Acknowledge the token's self-closing flag, if it is set.
3090 /* A start tag with the tag name "noframes" */
3091 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
3092 $token['name'] === 'noframes') {
3093 /* Process the token using the rules for the "in head" insertion mode. */
3094 $this->processwithRulesFor($token, self::IN_HEAD);
3096 } elseif ($token['type'] === HTML5_Tokenizer::EOF) {
3097 // XERROR: If the current node is not the root html element, then this is a parse error.
3098 /* Stop parsing */
3099 /* Anything else */
3100 } else {
3101 /* Parse error. Ignore the token. */
3102 $this->ignored = true;
3104 break;
3106 case self::AFTER_FRAMESET:
3107 /* Handle the token as follows: */
3109 /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3110 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3111 U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
3112 if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
3113 /* Append the character to the current node. */
3114 $this->insertText($token['data']);
3116 /* A comment token */
3117 } elseif ($token['type'] === HTML5_Tokenizer::COMMENT) {
3118 /* Append a Comment node to the current node with the data
3119 attribute set to the data given in the comment token. */
3120 $this->insertComment($token['data']);
3122 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
3123 // parse error
3125 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html') {
3126 $this->processWithRulesFor($token, self::IN_BODY);
3128 /* An end tag with the tag name "html" */
3129 } elseif ($token['type'] === HTML5_Tokenizer::ENDTAG &&
3130 $token['name'] === 'html') {
3131 $this->mode = self::AFTER_AFTER_FRAMESET;
3133 /* A start tag with the tag name "noframes" */
3134 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG &&
3135 $token['name'] === 'noframes') {
3136 $this->processWithRulesFor($token, self::IN_HEAD);
3138 } elseif ($token['type'] === HTML5_Tokenizer::EOF) {
3139 /* Stop parsing */
3141 /* Anything else */
3142 } else {
3143 /* Parse error. Ignore the token. */
3144 $this->ignored = true;
3146 break;
3148 case self::AFTER_AFTER_BODY:
3149 /* A comment token */
3150 if ($token['type'] === HTML5_Tokenizer::COMMENT) {
3151 /* Append a Comment node to the Document object with the data
3152 attribute set to the data given in the comment token. */
3153 // XDOM
3154 $comment = $this->dom->createComment($token['data']);
3155 $this->dom->appendChild($comment);
3157 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE ||
3158 $token['type'] === HTML5_Tokenizer::SPACECHARACTER ||
3159 ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html')) {
3160 $this->processWithRulesFor($token, self::IN_BODY);
3162 /* An end-of-file token */
3163 } elseif ($token['type'] === HTML5_Tokenizer::EOF) {
3164 /* OMG DONE!! */
3165 } else {
3166 // parse error
3167 $this->mode = self::IN_BODY;
3168 $this->emitToken($token);
3170 break;
3172 case self::AFTER_AFTER_FRAMESET:
3173 /* A comment token */
3174 if ($token['type'] === HTML5_Tokenizer::COMMENT) {
3175 /* Append a Comment node to the Document object with the data
3176 attribute set to the data given in the comment token. */
3177 // XDOM
3178 $comment = $this->dom->createComment($token['data']);
3179 $this->dom->appendChild($comment);
3180 } elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE ||
3181 $token['type'] === HTML5_Tokenizer::SPACECHARACTER ||
3182 ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'html')) {
3183 $this->processWithRulesFor($token, self::IN_BODY);
3185 /* An end-of-file token */
3186 } elseif ($token['type'] === HTML5_Tokenizer::EOF) {
3187 /* OMG DONE!! */
3188 } elseif ($token['type'] === HTML5_Tokenizer::STARTTAG && $token['name'] === 'nofrmaes') {
3189 $this->processWithRulesFor($token, self::IN_HEAD);
3190 } else {
3191 // parse error
3193 break;
/**
 * Creates an element in the HTML namespace for a start-tag token, copies
 * the token's attributes onto it and, unless $append is false, inserts it
 * into the tree and pushes it onto the stack of open elements.
 *
 * @param array $token  Token to build from; reads 'name' and, when
 *                      present, 'attr' (a list of arrays with 'name' and
 *                      'value' keys).
 * @param bool  $append When true, the element is handed to
 *                      appendToRealParent() and pushed onto $this->stack.
 * @return mixed The element returned by $this->dom->createElementNS().
 */
private function insertElement($token, $append = true) {
    $el = $this->dom->createElementNS(self::NS_HTML, $token['name']);

    if (!empty($token['attr'])) {
        foreach ($token['attr'] as $attr) {
            // The first occurrence of an attribute wins, and the name has
            // to start with a character an XML name may start with.
            if ($el->hasAttribute($attr['name']) || !preg_match("/^[a-zA-Z_:]/", $attr['name'])) {
                continue;
            }

            try {
                $el->setAttribute($attr['name'], $attr['value']);
            } catch (DOMException $e) {
                // The regex above only vets the first character; a name
                // such as `b"c` is still rejected by the DOM. Skip that
                // attribute instead of aborting the whole parse.
            }
        }
    }

    if ($append) {
        $this->appendToRealParent($el);
        $this->stack[] = $el;
    }

    return $el;
}
/**
 * Appends character data to the tree as a text node.
 *
 * Empty data is ignored. While $this->ignore_lf_token is set, one leading
 * line feed is stripped first; if nothing remains, no node is created.
 *
 * @param string $data Character data to insert.
 */
private function insertText($data) {
    if ($data === '') {
        return;
    }

    if ($this->ignore_lf_token && $data[0] === "\n") {
        $data = substr($data, 1);
        // substr() yields false on PHP < 8 and '' on PHP >= 8 when the
        // data was just the line feed; both mean "nothing left to insert".
        if ($data === false || $data === '') {
            return;
        }
    }

    $text = $this->dom->createTextNode($data);
    $this->appendToRealParent($text);
}
/**
 * Builds a comment node from the given text and inserts it through
 * appendToRealParent().
 *
 * @param string $data Text of the comment.
 */
private function insertComment($data) {
    $this->appendToRealParent($this->dom->createComment($data));
}
/**
 * Inserts a node at the current insertion point.
 *
 * Normally that is the current node (the last entry of the stack of open
 * elements). When foster parenting is switched on and the current node is
 * a table, tbody, tfoot, thead or tr element, the node is passed to
 * fosterParent() instead.
 *
 * @param mixed $node Node to insert.
 */
private function appendToRealParent($node) {
    $current = end($this->stack);
    $table_like = array('table', 'tbody', 'tfoot', 'thead', 'tr');

    if ($this->foster_parent && in_array($current->tagName, $table_like)) {
        $this->fosterParent($node);
        return;
    }

    $current->appendChild($node);
}
/**
 * Tells whether an element with the given tag name is "in scope" on the
 * stack of open elements.
 *
 * The stack is walked from the current node towards the root. The walk
 * succeeds on the first node whose tag name matches and fails on the
 * first scope boundary:
 *  - table and html bound every scope;
 *  - applet, caption, td, th, button, marquee, object and SVG
 *    foreignObject bound every scope except SCOPE_TABLE;
 *  - ol and ul additionally bound SCOPE_LISTITEM.
 *
 * @param string|array $el    Tag name, or a list of tag names of which any
 *                            one may be in scope.
 * @param int          $scope One of the SCOPE* class constants.
 * @return bool|null True on a match, false on a boundary (or when none of
 *                   a list matched), null if the stack ran out first.
 */
private function elementInScope($el, $scope = self::SCOPE) {
    if (is_array($el)) {
        foreach ($el as $candidate) {
            if ($this->elementInScope($candidate, $scope)) {
                return true;
            }
        }

        return false;
    }

    $general_boundaries = array('applet', 'caption', 'td', 'th', 'button', 'marquee', 'object');

    for ($idx = count($this->stack) - 1; $idx >= 0; $idx--) {
        $node = $this->stack[$idx];
        $tag = $node->tagName;

        // Found the target before hitting any boundary.
        if ($tag === $el) {
            return true;
        }

        // Boundaries shared by all scopes.
        if ($tag === 'table' || $tag === 'html') {
            return false;
        }

        // Boundaries for "in scope" and "in list item scope".
        if ($scope !== self::SCOPE_TABLE) {
            $is_svg_foreign_object = $tag === 'foreignObject' && $node->namespaceURI === self::NS_SVG;
            if (in_array($tag, $general_boundaries) || $is_svg_foreign_object) {
                return false;
            }
        }

        // Extra boundaries for "in list item scope".
        if ($scope === self::SCOPE_LISTITEM && in_array($tag, array('ol', 'ul'))) {
            return false;
        }
    }

    // Only reached when the stack holds neither the target nor a boundary.
    return null;
}
/**
 * Reconstructs the active formatting elements: every entry at the tail of
 * the list that follows the last marker / still-open element and is not
 * itself on the stack of open elements is shallow-cloned, inserted at the
 * current insertion point, pushed onto the stack and swapped into the
 * list in place of the original.
 *
 * Assumes $this->a_formatting is indexed 0..n-1 (it is addressed by
 * integer position below).
 *
 * @return bool False when there was nothing to reconstruct, true otherwise.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if ($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last entry is a marker, or an element that is in the stack
    of open elements, then there is nothing to reconstruct. */
    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 4-6. Rewind: step backwards until the start of the list is reached
    or an entry is found that is a marker or is already open. */
    $a = $formatting_elements - 1;
    // Whether step 7 (advance) must run before the next clone. Reaching
    // the start of the list jumps straight to step 8, so it stays false.
    $advance = false;

    while ($a > 0) {
        $a--;
        $entry = $this->a_formatting[$a];

        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            // This entry must not be cloned itself: reconstruction starts
            // with the entry after it. Leaving the flag unset here would
            // make the loop below clone the marker / open element.
            $advance = true;
            break;
        }
    }

    while (true) {
        /* 7. Let entry be the element one later than entry in the list. */
        if ($advance) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        $this->appendToRealParent($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If clone is not the last entry in the list, return to 7. */
        if (end($this->a_formatting) === $clone) {
            break;
        }
        $advance = true;
    }

    // Return value not in use ATM.
    return true;
}
/**
 * Clears the list of active formatting elements up to the last marker:
 * entries are removed from the end of the list until a marker has been
 * removed.
 *
 * If the list contains no marker at all, it is emptied and the method
 * returns, rather than popping from an empty list forever.
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    while (!empty($this->a_formatting)) {
        /* 1-2. Take the last (most recently added) entry off the list. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, the list has been cleared up to the
        last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
/**
 * Generates implied end tags: while the current node is a dc, dd, ds, dt,
 * li, p, td, th or tr element (minus anything listed in $exclude), it is
 * popped off the stack of open elements.
 *
 * NOTE(review): the spec wording previously quoted here listed option,
 * optgroup, rp and rt and did not list td, th or tr. The element list
 * below is what the code has always used and is left untouched - confirm
 * against the targeted spec revision before changing it.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 */
private function generateImpliedEndTags($exclude = array()) {
    $elements = array_diff(array('dc', 'dd', 'ds', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    // end() returns false once the stack is empty; stop there instead of
    // reading a property of a non-object.
    while (($current = end($this->stack)) !== false && in_array($current->tagName, $elements, true)) {
        array_pop($this->stack);
    }
}
/**
 * Classifies a node by looking its tag name up in the $special, $scoping
 * and $formatting lists, in that order; anything else is phrasing.
 *
 * A backtrace is printed when $node is not an object (debugging aid); the
 * lookup is still attempted afterwards.
 *
 * @param mixed $node Node to classify.
 * @return int One of self::SPECIAL, self::SCOPING, self::FORMATTING or
 *             self::PHRASING.
 */
private function getElementCategory($node) {
    if (!is_object($node)) {
        debug_print_backtrace();
    }

    $tag = $node->tagName;

    if (in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    return in_array($tag, $this->formatting) ? self::FORMATTING : self::PHRASING;
}
/**
 * Clears the stack of open elements back to a table context: the current
 * node is popped repeatedly until its tag name is one of $elements.
 *
 * @param array $elements Tag names at which popping stops.
 */
private function clearStackToTableContext($elements) {
    while (!in_array(end($this->stack)->tagName, $elements)) {
        array_pop($this->stack);
    }
}
/**
 * Resets the insertion mode appropriately by walking the stack of open
 * elements from the current node towards the root and picking the mode
 * that belongs to the first node that determines one.
 *
 * @param mixed $context Context element used in place of the root element
 *                       (fragment case); null when there is none.
 */
private function resetInsertionMode($context = null) {
    /* 1. Let last be false. */
    $last = false;

    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        /* 2. Let node be the last node in the stack of open elements. */
        $node = $this->stack[$n];

        /* 3. If node is the first node in the stack of open elements, then
         * set last to true and set node to the context element. (fragment
         * case) */
        if ($this->stack[0]->isSameNode($node)) {
            $last = true;
            $node = $context;
        }

        // Without a context element there is nothing to inspect at the
        // root. None of the tests below can match, so this ends in step 16
        // ("in body"); say so directly instead of reading properties of a
        // non-object.
        if (!is_object($node)) {
            $this->mode = self::IN_BODY;
            break;
        }

        $name = $node->tagName;

        if ($name === 'select') {
            /* 4. select: "in select". (fragment case) */
            $this->mode = self::IN_SELECT;
            break;

        } elseif ($name === 'td' || $name === 'th') {
            /* 5. td or th: "in cell". */
            $this->mode = self::IN_CELL;
            break;

        } elseif ($name === 'tr') {
            /* 6. tr: "in row". */
            $this->mode = self::IN_ROW;
            break;

        } elseif (in_array($name, array('tbody', 'thead', 'tfoot'))) {
            /* 7. tbody, thead or tfoot: "in table body". */
            $this->mode = self::IN_TABLE_BODY;
            break;

        } elseif ($name === 'caption') {
            /* 8. caption: "in caption". */
            $this->mode = self::IN_CAPTION;
            break;

        } elseif ($name === 'colgroup') {
            /* 9. colgroup: "in column group". (innerHTML case) */
            $this->mode = self::IN_COLUMN_GROUP;
            break;

        } elseif ($name === 'table') {
            /* 10. table: "in table". */
            $this->mode = self::IN_TABLE;
            break;

        } elseif ($node->namespaceURI === self::NS_SVG ||
            $node->namespaceURI === self::NS_MATHML) {
            /* 11. MathML or SVG element: "in foreign content", with "in
             * body" as the secondary insertion mode. */
            $this->mode = self::IN_FOREIGN_CONTENT;
            $this->secondary_mode = self::IN_BODY;
            break;

        } elseif ($name === 'head') {
            /* 12. head: "in body" ("in body"! not "in head"!).
            (fragment case) */
            $this->mode = self::IN_BODY;
            break;

        } elseif ($name === 'body') {
            /* 13. body: "in body". */
            $this->mode = self::IN_BODY;
            break;

        } elseif ($name === 'frameset') {
            /* 14. frameset: "in frameset". (fragment case) */
            $this->mode = self::IN_FRAMESET;
            break;

        } elseif ($name === 'html') {
            /* 15. html: "before head" if the head element pointer is null,
            "after head" otherwise. (fragment case) */
            $this->mode = ($this->head_pointer === null)
                ? self::BEFORE_HEAD
                : self::AFTER_HEAD;
            break;

        } elseif ($last) {
            /* 16. If last is true: "in body". (fragment case) */
            $this->mode = self::IN_BODY;
            break;
        }

        /* Otherwise continue with the previous entry in the stack. */
    }
}
/**
 * Closes the currently open table cell: for the first of td / th that is
 * found in table scope, an end tag token with that name is emitted.
 */
private function closeCell() {
    foreach (array('td', 'th') as $cell) {
        if (!$this->elementInScope($cell, self::SCOPE_TABLE)) {
            continue;
        }

        $end_tag = array(
            'name' => $cell,
            'type' => HTML5_Tokenizer::ENDTAG
        );
        $this->emitToken($end_tag);

        break;
    }
}
/**
 * Processes a token "using the rules for" another insertion mode.
 *
 * The token is forwarded to emitToken() together with the mode whose rules
 * should apply; $this->mode itself is not assigned here, so it only
 * changes if the invoked rules change it.
 *
 * @param array $token Token to process.
 * @param int   $mode  Insertion mode whose rules are to be used.
 */
private function processWithRulesFor($token, $mode) {
    $this->emitToken($token, $mode);
}
/**
 * Inserts an element for the token, remembers the current insertion mode
 * in $original_mode, switches to IN_CDATA_RCDATA and sets the content
 * model to HTML5_Tokenizer::CDATA.
 *
 * @param array $token Start-tag token of the element.
 */
private function insertCDATAElement($token) {
    $this->insertElement($token);

    $this->content_model = HTML5_Tokenizer::CDATA;
    $this->original_mode = $this->mode;
    $this->mode = self::IN_CDATA_RCDATA;
}
/**
 * Inserts an element for the token, remembers the current insertion mode
 * in $original_mode, switches to IN_CDATA_RCDATA and sets the content
 * model to HTML5_Tokenizer::RCDATA.
 *
 * @param array $token Start-tag token of the element.
 */
private function insertRCDATAElement($token) {
    $this->insertElement($token);

    $this->content_model = HTML5_Tokenizer::RCDATA;
    $this->original_mode = $this->mode;
    $this->mode = self::IN_CDATA_RCDATA;
}
/**
 * Looks up an attribute value on a token.
 *
 * All attributes are scanned, so when a name occurs more than once the
 * value of the last occurrence is returned.
 *
 * @param array  $token Token; its optional 'attr' entry is a list of
 *                      arrays with 'name' and 'value' keys.
 * @param string $key   Attribute name to look for.
 * @return mixed The attribute value, or false when the token has no
 *               attributes or none with that name.
 */
private function getAttr($token, $key) {
    $value = false;

    if (isset($token['attr'])) {
        foreach ($token['attr'] as $pair) {
            if ($pair['name'] === $key) {
                $value = $pair['value'];
            }
        }
    }

    return $value;
}
/**
 * Returns the current table: the last table element in the stack of open
 * elements, or - when the stack holds no table (fragment case) - the
 * first element in the stack (the html element).
 *
 * @return mixed Entry of $this->stack.
 */
private function getCurrentTable() {
    $pos = count($this->stack);

    while (--$pos >= 0) {
        $candidate = $this->stack[$pos];
        if ($candidate->tagName === 'table') {
            return $candidate;
        }
    }

    return $this->stack[0];
}
/**
 * Determines the foster parent element.
 *
 *  - If the stack of open elements holds a table whose parent node is an
 *    element, that parent is the foster parent.
 *  - If the last table has no parent, or its parent is not an element,
 *    the foster parent is the entry just before that table in the stack.
 *  - If there is no table in the stack (innerHTML case), it is the first
 *    element in the stack (the html element).
 *
 * @return mixed The foster parent, or null when the table is the first
 *               stack entry and has no element parent.
 */
private function getFosterParent() {
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        if ($this->stack[$n]->tagName !== 'table') {
            continue;
        }

        $parent = $this->stack[$n]->parentNode;

        // Only an element parent qualifies; a detached table or one that
        // hangs off a non-element node falls back to the stack.
        if ($parent !== null && $parent->nodeType === XML_ELEMENT_NODE) {
            return $parent;
        }

        return isset($this->stack[$n - 1]) ? $this->stack[$n - 1] : null;
    }

    // No table element in the stack of open elements.
    return $this->stack[0];
}
/**
 * Foster parents a node, i.e. inserts it into the foster parent element.
 *
 * If the foster parent element is the parent element of the last table
 * element in the stack of open elements, the node is inserted
 * immediately before that table element; otherwise the node is appended
 * to the foster parent element.
 *
 * @param mixed $node Node to insert; it is handed to insertBefore() or
 *                    appendChild() on the foster parent.
 */
public function fosterParent($node) {
    $foster_parent = $this->getFosterParent();
    // Almost equivalent to the last table element, except that it can be
    // the html element when no table is open.
    $table = $this->getCurrentTable();

    // A table that is not attached to any parent has a null parentNode;
    // calling isSameNode() on it would be a fatal error, so treat that
    // case as "foster parent is not the table's parent" and append.
    $table_parent = ($table->tagName === 'table') ? $table->parentNode : null;

    if ($table_parent !== null && $table_parent->isSameNode($foster_parent)) {
        $foster_parent->insertBefore($node, $table);
    } else {
        $foster_parent->appendChild($node);
    }
}
/**
 * For debugging: echoes the tag names of the stack of open elements on
 * one line, in stack order.
 */
private function printStack() {
    $tagNames = array_map(
        function ($element) {
            return $element->tagName;
        },
        $this->stack
    );

    echo " -> stack [" . implode(', ', $tagNames) . "]\n";
}
/**
 * For debugging: echoes the list of active formatting elements on one
 * line. Marker entries are shown as "MARKER", every other entry by its
 * tag name. Nothing is printed when the list is empty.
 */
private function printActiveFormattingElements() {
    if (!$this->a_formatting) {
        return;
    }

    $labels = array();
    foreach ($this->a_formatting as $entry) {
        $labels[] = ($entry === self::MARKER) ? 'MARKER' : $entry->tagName;
    }

    echo " -> active formatting [" . implode(', ', $labels) . "]\n";
}
/**
 * Reports whether the current table has been flagged as tainted.
 *
 * @return bool True when the node returned by getCurrentTable() has a
 *              non-empty "tainted" property, false otherwise.
 */
public function currentTableIsTainted() {
    $table = $this->getCurrentTable();

    if (empty($table->tainted)) {
        return false;
    }

    return true;
}
/**
 * Sets up the tree constructor for building a fragment.
 *
 * The fragment flag is always raised. When a context tag name is given, a
 * context element is created from it, the tokenizer content model is
 * chosen from that element, a fresh html root becomes the only entry of
 * the stack of open elements, the insertion mode is reset and the form
 * element pointer is initialised.
 *
 * @param null $context Tag name of the context element, or a falsy value
 *                      to only mark the parse as a fragment parse.
 */
public function setupContext($context = null) {
    $this->fragment = true;
    if (!$context) {
        return;
    }

    $context = $this->dom->createElementNS(self::NS_HTML, $context);

    /* 4.1. Set the HTML parser's tokenization stage's content model
     * flag according to the context element, as follows: */
    $content_models = array(
        'title'     => HTML5_Tokenizer::RCDATA,
        'textarea'  => HTML5_Tokenizer::RCDATA,
        'style'     => HTML5_Tokenizer::CDATA,
        'script'    => HTML5_Tokenizer::CDATA,
        'xmp'       => HTML5_Tokenizer::CDATA,
        'iframe'    => HTML5_Tokenizer::CDATA,
        'noembed'   => HTML5_Tokenizer::CDATA,
        'noframes'  => HTML5_Tokenizer::CDATA,
        // XSCRIPT: assuming scripting is enabled
        'noscript'  => HTML5_Tokenizer::CDATA,
        'plaintext' => HTML5_Tokenizer::PLAINTEXT,
    );
    $tag = $context->tagName;
    if (array_key_exists($tag, $content_models)) {
        $this->content_model = $content_models[$tag];
    }

    /* 4.2. Let root be a new html element with no attributes. */
    $this->root = $this->dom->createElementNS(self::NS_HTML, 'html');
    /* 4.3 Append the element root to the Document node created above. */
    $this->dom->appendChild($this->root);
    /* 4.4 Set up the parser's stack of open elements so that it
     * contains just the single element root. */
    $this->stack = array($this->root);
    /* 4.5 Reset the parser's insertion mode appropriately. */
    $this->resetInsertionMode($context);

    /* 4.6 Set the parser's form element pointer to the nearest node
     * to the context element that is a form element (going straight up
     * the ancestor chain, and including the element itself, if it is a
     * form element), or, if there is no such form element, to null. */
    for ($node = $context; $node; $node = $node->parentNode) {
        if ($node->tagName === 'form') {
            $this->form_pointer = $node;
            break;
        }
    }
}
/**
 * Adjusts MathML attribute names on a token: the lower-cased name
 * "definitionurl" is rewritten to "definitionURL".
 *
 * @param array $token Token whose optional 'attr' entry is a list of
 *                     name/value pairs.
 * @return mixed The token with adjusted attribute names. A token without
 *               a usable 'attr' list is returned untouched.
 */
public function adjustMathMLAttributes($token) {
    // Nothing to adjust; also avoids iterating a missing 'attr' entry,
    // which would warn and leave a stray null 'attr' key on the token.
    if (empty($token['attr']) || !is_array($token['attr'])) {
        return $token;
    }

    foreach ($token['attr'] as $index => $kp) {
        if ($kp['name'] === 'definitionurl') {
            $token['attr'][$index]['name'] = 'definitionURL';
        }
    }

    return $token;
}
/**
 * Adjusts SVG attribute names on a token: lower-cased names found in the
 * lookup table are replaced by their mixed-case spelling (for example
 * "viewbox" becomes "viewBox"). Names not in the table are left alone.
 *
 * @param array $token Token whose optional 'attr' entry is a list of
 *                     name/value pairs.
 * @return mixed The token with adjusted attribute names. A token without
 *               a usable 'attr' list is returned untouched.
 */
public function adjustSVGAttributes($token) {
    static $lookup = array(
        'attributename' => 'attributeName',
        'attributetype' => 'attributeType',
        'basefrequency' => 'baseFrequency',
        'baseprofile' => 'baseProfile',
        'calcmode' => 'calcMode',
        'clippathunits' => 'clipPathUnits',
        'contentscripttype' => 'contentScriptType',
        'contentstyletype' => 'contentStyleType',
        'diffuseconstant' => 'diffuseConstant',
        'edgemode' => 'edgeMode',
        'externalresourcesrequired' => 'externalResourcesRequired',
        'filterres' => 'filterRes',
        'filterunits' => 'filterUnits',
        'glyphref' => 'glyphRef',
        'gradienttransform' => 'gradientTransform',
        'gradientunits' => 'gradientUnits',
        'kernelmatrix' => 'kernelMatrix',
        'kernelunitlength' => 'kernelUnitLength',
        'keypoints' => 'keyPoints',
        'keysplines' => 'keySplines',
        'keytimes' => 'keyTimes',
        'lengthadjust' => 'lengthAdjust',
        'limitingconeangle' => 'limitingConeAngle',
        'markerheight' => 'markerHeight',
        'markerunits' => 'markerUnits',
        'markerwidth' => 'markerWidth',
        'maskcontentunits' => 'maskContentUnits',
        'maskunits' => 'maskUnits',
        'numoctaves' => 'numOctaves',
        'pathlength' => 'pathLength',
        'patterncontentunits' => 'patternContentUnits',
        'patterntransform' => 'patternTransform',
        'patternunits' => 'patternUnits',
        'pointsatx' => 'pointsAtX',
        'pointsaty' => 'pointsAtY',
        'pointsatz' => 'pointsAtZ',
        'preservealpha' => 'preserveAlpha',
        'preserveaspectratio' => 'preserveAspectRatio',
        'primitiveunits' => 'primitiveUnits',
        'refx' => 'refX',
        'refy' => 'refY',
        'repeatcount' => 'repeatCount',
        'repeatdur' => 'repeatDur',
        'requiredextensions' => 'requiredExtensions',
        'requiredfeatures' => 'requiredFeatures',
        'specularconstant' => 'specularConstant',
        'specularexponent' => 'specularExponent',
        'spreadmethod' => 'spreadMethod',
        'startoffset' => 'startOffset',
        'stddeviation' => 'stdDeviation',
        'stitchtiles' => 'stitchTiles',
        'surfacescale' => 'surfaceScale',
        'systemlanguage' => 'systemLanguage',
        'tablevalues' => 'tableValues',
        'targetx' => 'targetX',
        'targety' => 'targetY',
        'textlength' => 'textLength',
        'viewbox' => 'viewBox',
        'viewtarget' => 'viewTarget',
        'xchannelselector' => 'xChannelSelector',
        'ychannelselector' => 'yChannelSelector',
        'zoomandpan' => 'zoomAndPan',
    );

    // Nothing to adjust; also avoids iterating a missing 'attr' entry,
    // which would warn and leave a stray null 'attr' key on the token.
    if (empty($token['attr']) || !is_array($token['attr'])) {
        return $token;
    }

    foreach ($token['attr'] as $index => $kp) {
        if (isset($lookup[$kp['name']])) {
            $token['attr'][$index]['name'] = $lookup[$kp['name']];
        }
    }

    return $token;
}
/**
 * Adjusts foreign (namespaced) attribute names on a token.
 *
 * For every attribute whose name is listed in the lookup table, the name
 * string is replaced by a three-element array: prefix (or null), local
 * name, and namespace URI. Other attributes keep their string name.
 *
 * @param array $token Token whose optional 'attr' entry is a list of
 *                     name/value pairs.
 * @return mixed The token with adjusted attribute names. A token without
 *               a usable 'attr' list is returned untouched.
 */
public function adjustForeignAttributes($token) {
    static $lookup = array(
        'xlink:actuate' => array('xlink', 'actuate', self::NS_XLINK),
        'xlink:arcrole' => array('xlink', 'arcrole', self::NS_XLINK),
        'xlink:href' => array('xlink', 'href', self::NS_XLINK),
        'xlink:role' => array('xlink', 'role', self::NS_XLINK),
        'xlink:show' => array('xlink', 'show', self::NS_XLINK),
        'xlink:title' => array('xlink', 'title', self::NS_XLINK),
        'xlink:type' => array('xlink', 'type', self::NS_XLINK),
        'xml:base' => array('xml', 'base', self::NS_XML),
        'xml:lang' => array('xml', 'lang', self::NS_XML),
        'xml:space' => array('xml', 'space', self::NS_XML),
        'xmlns' => array(null, 'xmlns', self::NS_XMLNS),
        'xmlns:xlink' => array('xmlns', 'xlink', self::NS_XMLNS),
    );

    // Nothing to adjust; also avoids iterating a missing 'attr' entry,
    // which would warn and leave a stray null 'attr' key on the token.
    if (empty($token['attr']) || !is_array($token['attr'])) {
        return $token;
    }

    foreach ($token['attr'] as $index => $kp) {
        if (isset($lookup[$kp['name']])) {
            $token['attr'][$index]['name'] = $lookup[$kp['name']];
        }
    }

    return $token;
}
/**
 * Creates an element in the given namespace from a token, copies the
 * token's attributes onto it, appends it via appendToRealParent() and
 * pushes it onto the stack of open elements.
 *
 * Attribute names may be plain strings or the arrays produced by
 * adjustForeignAttributes(); for an array, index 1 is used as the
 * attribute name and index 2 as its namespace. An attribute is skipped
 * when the element already reports one with the same namespace and name.
 *
 * @param array $token        Token providing 'name' and, optionally,
 *                            an 'attr' list of name/value pairs.
 * @param mixed $namespaceURI Namespace the new element is created in.
 */
public function insertForeignElement($token, $namespaceURI) {
    $el = $this->dom->createElementNS($namespaceURI, $token['name']);

    if (!empty($token['attr'])) {
        foreach ($token['attr'] as $kp) {
            $name = $kp['name'];
            $ns = self::NS_HTML;
            if (is_array($name)) {
                $ns = $name[2];
                $name = $name[1];
            }

            if ($el->hasAttributeNS($ns, $name)) {
                continue;
            }

            if ($ns === self::NS_XLINK) {
                // XSKETCHY: work around godawful libxml bug
                $el->setAttribute('xlink:'.$name, $kp['value']);
                continue;
            }

            if ($ns === self::NS_HTML) {
                // Another godawful libxml bug
                $el->setAttribute($name, $kp['value']);
                continue;
            }

            $el->setAttributeNS($ns, $name, $kp['value']);
        }
    }

    $this->appendToRealParent($el);
    $this->stack[] = $el;
    // XERROR: see below
    /* If the newly created element has an xmlns attribute in the XMLNS
     * namespace whose value is not exactly the same as the element's
     * namespace, that is a parse error. Similarly, if the newly created
     * element has an xmlns:xlink attribute in the XMLNS namespace whose
     * value is not the XLink Namespace, that is a parse error. */
}
/**
 * Normalizes the document and returns the parse result.
 *
 * @return DOMDocument|DOMNodeList The document itself for a full-document
 *         parse; for a fragment parse, the child nodes of the fragment
 *         root when one was set up, otherwise the document's child nodes.
 */
public function save() {
    $this->dom->normalize();

    if (!$this->fragment) {
        return $this->dom;
    }

    $container = $this->root ? $this->root : $this->dom;

    return $container->childNodes;
}