From 0767bbc12dbbc8a9c31cc235f055443257ffa51e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@mit.edu>
Date: Sun, 20 Oct 2013 22:18:59 -0700
Subject: [PATCH] Rewrite FixNesting implementation to be tree-based.

This mega-patch rips out the FixNesting implementation and the related
ChildDef components.  The primary algorithmic change is to convert from
use of tokens to tree nodes, which are far more amenable to the style
of processing that FixNesting uses.  Additionally, FixNesting has been
changed to go bottom-up rather than top-down, in order to avoid needing
to implement backtracking.

This patch simplifies a good deal of the relevant logic, since we no
longer need to continually recalculate the nesting structure when
processing things.  However, the conversion to the alternate format
incurs some overhead, so for small inputs these changes are not a win.
One possibility to greatly reduce the constant factors here is to switch
to entirely using libxml's representation, and never serializing tokens;
this would require one to rewrite injectors, however.

The iterative post-order traversal in FixNesting is a bit subtle, but
we have essentially reified the stack and continuations.

We've removed support for %Core.EscapeInvalidChildren.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
---
 NEWS                                               |   5 +
 configdoc/usage.xml                                |   7 +-
 library/HTMLPurifier/ChildDef.php                  |   8 +-
 library/HTMLPurifier/ChildDef/Chameleon.php        |   8 +-
 library/HTMLPurifier/ChildDef/Custom.php           |  21 +-
 library/HTMLPurifier/ChildDef/Empty.php            |   4 +-
 library/HTMLPurifier/ChildDef/List.php             | 229 ++++-----
 library/HTMLPurifier/ChildDef/Optional.php         |  12 +-
 library/HTMLPurifier/ChildDef/Required.php         |  71 +--
 library/HTMLPurifier/ChildDef/StrictBlockquote.php |  56 +--
 library/HTMLPurifier/ChildDef/Table.php            | 514 +++++++++----------
 .../schema/Core.EscapeInvalidChildren.txt          |   6 +-
 library/HTMLPurifier/Node.php                      |   9 +
 library/HTMLPurifier/Node/Text.php                 |   7 +
 library/HTMLPurifier/Strategy/FixNesting.php       | 544 +++++++--------------
 tests/HTMLPurifier/ChildDef/CustomTest.php         |   5 +
 tests/HTMLPurifier/ChildDef/ListTest.php           |   6 -
 tests/HTMLPurifier/ChildDef/RequiredTest.php       |   9 +-
 tests/HTMLPurifier/ChildDef/TableTest.php          |   2 +-
 tests/HTMLPurifier/ChildDefHarness.php             |   2 +-
 tests/HTMLPurifier/ComplexHarness.php              |  33 +-
 tests/HTMLPurifier/Strategy/FixNestingTest.php     |  18 -
 22 files changed, 618 insertions(+), 958 deletions(-)
 rewrite library/HTMLPurifier/ChildDef/List.php (77%)
 rewrite library/HTMLPurifier/ChildDef/Table.php (75%)
 rewrite library/HTMLPurifier/Strategy/FixNesting.php (78%)

diff --git a/NEWS b/NEWS
index e4a92264..e6f5c973 100644
--- a/NEWS
+++ b/NEWS
@@ -15,10 +15,15 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 # URI parsing algorithm was made more strict, so only prefixes which
   looks like schemes will actually be schemes.  Thanks
   Michael Gusev <mgusev@sugarcrm.com> for fixing.
+# %Core.EscapeInvalidChildren is no longer supported, and no longer does
+  anything.
 ! New directive %Core.AllowHostnameUnderscore which allows underscores
   in hostnames.
 - Eliminate quadratic behavior in DOMLex by using a proper queue.
   Thanks Ole Laursen for noticing this.
+- Rewritten MakeWellFormed/FixNesting implementation eliminates quadratic
+  behavior in the rest of the purification pipeline.  Thanks Chedburn
+  Networks for sponsoring this work.
 - Made Linkify URL parser a bit less permissive, so that non-breaking
   spaces and commas are not included as part of URL.  Thanks nAS for fixing.
 - Fix some bad interactions with %HTML.Allowed and injectors.  Thanks
diff --git a/configdoc/usage.xml b/configdoc/usage.xml
index 56eeafee..f3f7a36a 100644
--- a/configdoc/usage.xml
+++ b/configdoc/usage.xml
@@ -406,11 +406,6 @@
    <line>53</line>
   </file>
  </directive>
- <directive id="Core.EscapeInvalidChildren">
-  <file name="HTMLPurifier/ChildDef/Required.php">
-   <line>86</line>
-  </file>
- </directive>
  <directive id="Cache.SerializerPath">
   <file name="HTMLPurifier/DefinitionCache/Serializer.php">
    <line>171</line>
@@ -498,7 +493,7 @@
  </directive>
  <directive id="Core.DisableExcludes">
   <file name="HTMLPurifier/Strategy/FixNesting.php">
-   <line>67</line>
+   <line>54</line>
   </file>
  </directive>
  <directive id="Core.EscapeInvalidTags">
diff --git a/library/HTMLPurifier/ChildDef.php b/library/HTMLPurifier/ChildDef.php
index 2c0f1647..8eb17b82 100644
--- a/library/HTMLPurifier/ChildDef.php
+++ b/library/HTMLPurifier/ChildDef.php
@@ -1,7 +1,7 @@
 <?php
 
 /**
- * Defines allowed child nodes and validates tokens against it.
+ * Defines allowed child nodes and validates nodes against it.
  */
 abstract class HTMLPurifier_ChildDef
 {
@@ -41,12 +41,12 @@ abstract class HTMLPurifier_ChildDef
     /**
      * Validates nodes according to definition and returns modification.
      *
-     * @param HTMLPurifier_Token[] $tokens_of_children Array of HTMLPurifier_Token
+     * @param HTMLPurifier_Node[] $children Array of HTMLPurifier_Node
      * @param HTMLPurifier_Config $config HTMLPurifier_Config object
      * @param HTMLPurifier_Context $context HTMLPurifier_Context object
-     * @return bool|array true to leave nodes as is, false to remove parent node, array of replacement child tokens
+     * @return bool|array true to leave nodes as is, false to remove parent node, array of replacement children
      */
-    abstract public function validateChildren($tokens_of_children, $config, $context);
+    abstract public function validateChildren($children, $config, $context);
 }
 
 // vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Chameleon.php b/library/HTMLPurifier/ChildDef/Chameleon.php
index f7508739..7439be26 100644
--- a/library/HTMLPurifier/ChildDef/Chameleon.php
+++ b/library/HTMLPurifier/ChildDef/Chameleon.php
@@ -41,22 +41,22 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
     }
 
     /**
-     * @param array $tokens_of_children
+     * @param HTMLPurifier_Node[] $children
      * @param HTMLPurifier_Config $config
      * @param HTMLPurifier_Context $context
      * @return bool
      */
-    public function validateChildren($tokens_of_children, $config, $context)
+    public function validateChildren($children, $config, $context)
     {
         if ($context->get('IsInline') === false) {
             return $this->block->validateChildren(
-                $tokens_of_children,
+                $children,
                 $config,
                 $context
             );
         } else {
             return $this->inline->validateChildren(
-                $tokens_of_children,
+                $children,
                 $config,
                 $context
             );
diff --git a/library/HTMLPurifier/ChildDef/Custom.php b/library/HTMLPurifier/ChildDef/Custom.php
index 06193dc2..128132e9 100644
--- a/library/HTMLPurifier/ChildDef/Custom.php
+++ b/library/HTMLPurifier/ChildDef/Custom.php
@@ -73,31 +73,20 @@ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
     }
 
     /**
-     * @param array $tokens_of_children
+     * @param HTMLPurifier_Node[] $children
      * @param HTMLPurifier_Config $config
      * @param HTMLPurifier_Context $context
      * @return bool
      */
-    public function validateChildren($tokens_of_children, $config, $context)
+    public function validateChildren($children, $config, $context)
     {
         $list_of_children = '';
         $nesting = 0; // depth into the nest
-        foreach ($tokens_of_children as $token) {
-            if (!empty($token->is_whitespace)) {
+        foreach ($children as $node) {
+            if (!empty($node->is_whitespace)) {
                 continue;
             }
-
-            $is_child = ($nesting == 0); // direct
-
-            if ($token instanceof HTMLPurifier_Token_Start) {
-                $nesting++;
-            } elseif ($token instanceof HTMLPurifier_Token_End) {
-                $nesting--;
-            }
-
-            if ($is_child) {
-                $list_of_children .= $token->name . ',';
-            }
+            $list_of_children .= $node->name . ',';
         }
         // add leading comma to deal with stray comma declarations
         $list_of_children = ',' . rtrim($list_of_children, ',');
diff --git a/library/HTMLPurifier/ChildDef/Empty.php b/library/HTMLPurifier/ChildDef/Empty.php
index a6e0ea55..a8a6cbdd 100644
--- a/library/HTMLPurifier/ChildDef/Empty.php
+++ b/library/HTMLPurifier/ChildDef/Empty.php
@@ -24,12 +24,12 @@ class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
     }
 
     /**
-     * @param array $tokens_of_children
+     * @param HTMLPurifier_Node[] $children
      * @param HTMLPurifier_Config $config
      * @param HTMLPurifier_Context $context
      * @return array
      */
-    public function validateChildren($tokens_of_children, $config, $context)
+    public function validateChildren($children, $config, $context)
     {
         return array();
     }
diff --git a/library/HTMLPurifier/ChildDef/List.php b/library/HTMLPurifier/ChildDef/List.php
dissimilarity index 77%
index c9b021b5..891b9f6f 100644
--- a/library/HTMLPurifier/ChildDef/List.php
+++ b/library/HTMLPurifier/ChildDef/List.php
@@ -1,143 +1,86 @@
-<?php
-
-/**
- * Definition for list containers ul and ol.
- */
-class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
-{
-    /**
-     * @type string
-     */
-    public $type = 'list';
-    /**
-     * @type array
-     */
-    // lying a little bit, so that we can handle ul and ol ourselves
-    // XXX: This whole business with 'wrap' is all a bit unsatisfactory
-    public $elements = array('li' => true, 'ul' => true, 'ol' => true);
-
-    /**
-     * @param array $tokens_of_children
-     * @param HTMLPurifier_Config $config
-     * @param HTMLPurifier_Context $context
-     * @return array
-     */
-    public function validateChildren($tokens_of_children, $config, $context)
-    {
-        // Flag for subclasses
-        $this->whitespace = false;
-
-        // if there are no tokens, delete parent node
-        if (empty($tokens_of_children)) {
-            return false;
-        }
-
-        // the new set of children
-        $result = array();
-
-        // current depth into the nest
-        $nesting = 0;
-
-        // a little sanity check to make sure it's not ALL whitespace
-        $all_whitespace = true;
-
-        $seen_li = false;
-        $need_close_li = false;
-
-        foreach ($tokens_of_children as $token) {
-            if (!empty($token->is_whitespace)) {
-                $result[] = $token;
-                continue;
-            }
-            $all_whitespace = false; // phew, we're not talking about whitespace
-
-            if ($nesting == 1 && $need_close_li) {
-                $result[] = new HTMLPurifier_Token_End('li');
-                $nesting--;
-                $need_close_li = false;
-            }
-
-            $is_child = ($nesting == 0);
-
-            if ($token instanceof HTMLPurifier_Token_Start) {
-                $nesting++;
-            } elseif ($token instanceof HTMLPurifier_Token_End) {
-                $nesting--;
-            }
-
-            if ($is_child) {
-                if ($token->name === 'li') {
-                    // good
-                    $seen_li = true;
-                } elseif ($token->name === 'ul' || $token->name === 'ol') {
-                    // we want to tuck this into the previous li
-                    $need_close_li = true;
-                    $nesting++;
-                    if (!$seen_li) {
-                        // create a new li element
-                        $result[] = new HTMLPurifier_Token_Start('li');
-                    } else {
-                        // backtrack until </li> found
-                        while (true) {
-                            $t = array_pop($result);
-                            if ($t instanceof HTMLPurifier_Token_End) {
-                                // XXX actually, these invariants could very plausibly be violated
-                                // if we are doing silly things with modifying the set of allowed elements.
-                                // FORTUNATELY, it doesn't make a difference, since the allowed
-                                // elements are hard-coded here!
-                                if ($t->name !== 'li') {
-                                    trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
-                                    return false;
-                                }
-                                break;
-                            } elseif ($t instanceof HTMLPurifier_Token_Empty) { // bleagh
-                                if ($t->name !== 'li') {
-                                    trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
-                                    return false;
-                                }
-                                // XXX this should have a helper for it...
-                                $result[] = new HTMLPurifier_Token_Start('li', $t->attr, $t->line, $t->col, $t->armor);
-                                break;
-                            } else {
-                                if (!$t->is_whitespace) {
-                                    trigger_error(
-                                        "Only whitespace present invariant violated in List ChildDef",
-                                        E_USER_ERROR
-                                    );
-                                    return false;
-                                }
-                            }
-                        }
-                    }
-                } else {
-                    // start wrapping (this doesn't precisely mimic
-                    // browser behavior, but what browsers do is kind of
-                    // hard to mimic in a standards compliant way
-                    // XXX Actually, this has no impact in practice,
-                    // because this gets handled earlier. Arguably,
-                    // we should rip out all of that processing
-                    $result[] = new HTMLPurifier_Token_Start('li');
-                    $nesting++;
-                    $seen_li = true;
-                    $need_close_li = true;
-                }
-            }
-            $result[] = $token;
-        }
-        if ($need_close_li) {
-            $result[] = new HTMLPurifier_Token_End('li');
-        }
-        if (empty($result)) {
-            return false;
-        }
-        if ($all_whitespace) {
-            return false;
-        }
-        if ($tokens_of_children == $result) {
-            return true;
-        }
-        return $result;
-    }
-}
-
-// vim: et sw=4 sts=4
+<?php
+
+/**
+ * Definition for list containers ul and ol.
+ *
+ * What does this do?  The big thing is to handle ol/ul at the top
+ * level of list nodes, which should be handled specially by /folding/
+ * them into the previous list node.  We generally shouldn't ever
+ * see other disallowed elements, because the autoclose behavior
+ * in MakeWellFormed handles it.
+ */
+class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
+{
+    /**
+     * @type string
+     */
+    public $type = 'list';
+    /**
+     * @type array
+     */
+    // lying a little bit, so that we can handle ul and ol ourselves
+    // XXX: This whole business with 'wrap' is all a bit unsatisfactory
+    public $elements = array('li' => true, 'ul' => true, 'ol' => true);
+
+    /**
+     * @param array $children
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return array
+     */
+    public function validateChildren($children, $config, $context)
+    {
+        // Flag for subclasses
+        $this->whitespace = false;
+
+        // if there are no children, delete parent node
+        if (empty($children)) {
+            return false;
+        }
+
+        // the new set of children
+        $result = array();
+
+        // a little sanity check to make sure it's not ALL whitespace
+        $all_whitespace = true;
+
+        $current_li = false;
+
+        foreach ($children as $node) {
+            if (!empty($node->is_whitespace)) {
+                $result[] = $node;
+                continue;
+            }
+            $all_whitespace = false; // phew, we're not talking about whitespace
+
+            if ($node->name === 'li') {
+                // good
+                $current_li = $node;
+                $result[] = $node;
+            } else {
+                // we want to tuck this into the previous li
+                // Invariant: we expect the node to be ol/ul
+                // ToDo: Make this more robust in the case of not ol/ul
+                // by distinguishing between existing li and li created
+                // to handle non-list elements; non-list elements should
+                // not be appended to an existing li; only li created
+                // for non-list. This distinction is not currently made.
+                if ($current_li === false) {
+                    $current_li = new HTMLPurifier_Node_Element('li');
+                    $result[] = $current_li;
+                }
+                $current_li->children[] = $node;
+                $current_li->empty = false; // XXX li just gained a child, so clear its empty flag. ToDo: check for this error elsewhere
+            }
+        }
+        if (empty($result)) {
+            return false;
+        }
+        if ($all_whitespace) {
+            return false;
+        }
+        return $result;
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ChildDef/Optional.php b/library/HTMLPurifier/ChildDef/Optional.php
index 49dad0d1..b9468063 100644
--- a/library/HTMLPurifier/ChildDef/Optional.php
+++ b/library/HTMLPurifier/ChildDef/Optional.php
@@ -20,20 +20,20 @@ class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
     public $type = 'optional';
 
     /**
-     * @param array $tokens_of_children
+     * @param array $children
      * @param HTMLPurifier_Config $config
      * @param HTMLPurifier_Context $context
      * @return array
      */
-    public function validateChildren($tokens_of_children, $config, $context)
+    public function validateChildren($children, $config, $context)
     {
-        $result = parent::validateChildren($tokens_of_children, $config, $context);
-        // we assume that $tokens_of_children is not modified
+        $result = parent::validateChildren($children, $config, $context);
+        // we assume that $children is not modified
         if ($result === false) {
-            if (empty($tokens_of_children)) {
+            if (empty($children)) {
                 return true;
             } elseif ($this->whitespace) {
-                return $tokens_of_children;
+                return $children;
             } else {
                 return array();
             }
diff --git a/library/HTMLPurifier/ChildDef/Required.php b/library/HTMLPurifier/ChildDef/Required.php
index eaa85d4a..0d1c8f5f 100644
--- a/library/HTMLPurifier/ChildDef/Required.php
+++ b/library/HTMLPurifier/ChildDef/Required.php
@@ -50,30 +50,24 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
     public $type = 'required';
 
     /**
-     * @param array $tokens_of_children
+     * @param array $children
      * @param HTMLPurifier_Config $config
      * @param HTMLPurifier_Context $context
      * @return array
      */
-    public function validateChildren($tokens_of_children, $config, $context)
+    public function validateChildren($children, $config, $context)
     {
         // Flag for subclasses
         $this->whitespace = false;
 
         // if there are no tokens, delete parent node
-        if (empty($tokens_of_children)) {
+        if (empty($children)) {
             return false;
         }
 
         // the new set of children
         $result = array();
 
-        // current depth into the nest
-        $nesting = 0;
-
-        // whether or not we're deleting a node
-        $is_deleting = false;
-
         // whether or not parsed character data is allowed
         // this controls whether or not we silently drop a tag
         // or generate escaped HTML from it
@@ -82,51 +76,33 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
         // a little sanity check to make sure it's not ALL whitespace
         $all_whitespace = true;
 
-        // some configuration
-        $escape_invalid_children = $config->get('Core.EscapeInvalidChildren');
-
-        // generator
-        $gen = new HTMLPurifier_Generator($config, $context);
-
-        foreach ($tokens_of_children as $token) {
-            if (!empty($token->is_whitespace)) {
-                $result[] = $token;
+        $stack = array_reverse($children);
+        while (!empty($stack)) {
+            $node = array_pop($stack);
+            if (!empty($node->is_whitespace)) {
+                $result[] = $node;
                 continue;
             }
             $all_whitespace = false; // phew, we're not talking about whitespace
 
-            $is_child = ($nesting == 0);
-
-            if ($token instanceof HTMLPurifier_Token_Start) {
-                $nesting++;
-            } elseif ($token instanceof HTMLPurifier_Token_End) {
-                $nesting--;
-            }
-
-            if ($is_child) {
-                $is_deleting = false;
-                if (!isset($this->elements[$token->name])) {
-                    $is_deleting = true;
-                    if ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text) {
-                        $result[] = $token;
-                    } elseif ($pcdata_allowed && $escape_invalid_children) {
-                        $result[] = new HTMLPurifier_Token_Text(
-                            $gen->generateFromToken($token)
-                        );
+            if (!isset($this->elements[$node->name])) {
+                // special case text
+                // XXX One of these ought to be redundant or something
+                if ($pcdata_allowed && $node instanceof HTMLPurifier_Node_Text) {
+                    $result[] = $node;
+                    continue;
+                }
+                // spill the child contents in
+                // ToDo: Make configurable
+                if ($node instanceof HTMLPurifier_Node_Element) {
+                    for ($i = count($node->children) - 1; $i >= 0; $i--) {
+                        $stack[] = $node->children[$i];
                     }
                     continue;
                 }
+                continue;
             }
-            if (!$is_deleting || ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text)) {
-                $result[] = $token;
-            } elseif ($pcdata_allowed && $escape_invalid_children) {
-                $result[] =
-                    new HTMLPurifier_Token_Text(
-                        $gen->generateFromToken($token)
-                    );
-            } else {
-                // drop silently
-            }
+            $result[] = $node;
         }
         if (empty($result)) {
             return false;
@@ -135,9 +111,6 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
             $this->whitespace = true;
             return false;
         }
-        if ($tokens_of_children == $result) {
-            return true;
-        }
         return $result;
     }
 }
diff --git a/library/HTMLPurifier/ChildDef/StrictBlockquote.php b/library/HTMLPurifier/ChildDef/StrictBlockquote.php
index 89831a71..3270a46e 100644
--- a/library/HTMLPurifier/ChildDef/StrictBlockquote.php
+++ b/library/HTMLPurifier/ChildDef/StrictBlockquote.php
@@ -43,69 +43,51 @@ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Requi
     }
 
     /**
-     * @param array $tokens_of_children
+     * @param array $children
      * @param HTMLPurifier_Config $config
      * @param HTMLPurifier_Context $context
      * @return array
      */
-    public function validateChildren($tokens_of_children, $config, $context)
+    public function validateChildren($children, $config, $context)
     {
         $this->init($config);
 
         // trick the parent class into thinking it allows more
         $this->elements = $this->fake_elements;
-        $result = parent::validateChildren($tokens_of_children, $config, $context);
+        $result = parent::validateChildren($children, $config, $context);
         $this->elements = $this->real_elements;
 
         if ($result === false) {
             return array();
         }
         if ($result === true) {
-            $result = $tokens_of_children;
+            $result = $children;
         }
 
         $def = $config->getHTMLDefinition();
-        $block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper);
-        $block_wrap_end = new HTMLPurifier_Token_End($def->info_block_wrapper);
-        $is_inline = false;
-        $depth = 0;
+        $block_wrap_name = $def->info_block_wrapper;
+        $block_wrap = false;
         $ret = array();
 
-        // assuming that there are no comment tokens
-        foreach ($result as $i => $token) {
-            $token = $result[$i];
-            // ifs are nested for readability
-            if (!$is_inline) {
-                if (!$depth) {
-                    if (($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) ||
-                        (!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name]))) {
-                        $is_inline = true;
-                        $ret[] = $block_wrap_start;
-                    }
+        foreach ($result as $node) {
+            if ($block_wrap === false) {
+                if (($node instanceof HTMLPurifier_Node_Text && !$node->is_whitespace) ||
+                    ($node instanceof HTMLPurifier_Node_Element && !isset($this->elements[$node->name]))) {
+                        $block_wrap = new HTMLPurifier_Node_Element($def->info_block_wrapper);
+                        $ret[] = $block_wrap;
                 }
             } else {
-                if (!$depth) {
-                    // starting tokens have been inline text / empty
-                    if ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) {
-                        if (isset($this->elements[$token->name])) {
-                            // ended
-                            $ret[] = $block_wrap_end;
-                            $is_inline = false;
-                        }
-                    }
+                if ($node instanceof HTMLPurifier_Node_Element && isset($this->elements[$node->name])) {
+                    $block_wrap = false;
+
                 }
             }
-            $ret[] = $token;
-            if ($token instanceof HTMLPurifier_Token_Start) {
-                $depth++;
-            }
-            if ($token instanceof HTMLPurifier_Token_End) {
-                $depth--;
+            if ($block_wrap) {
+                $block_wrap->children[] = $node;
+            } else {
+                $ret[] = $node;
             }
         }
-        if ($is_inline) {
-            $ret[] = $block_wrap_end;
-        }
         return $ret;
     }
 
diff --git a/library/HTMLPurifier/ChildDef/Table.php b/library/HTMLPurifier/ChildDef/Table.php
dissimilarity index 75%
index cd1de1d4..3e4a0f21 100644
--- a/library/HTMLPurifier/ChildDef/Table.php
+++ b/library/HTMLPurifier/ChildDef/Table.php
@@ -1,290 +1,224 @@
-<?php
-
-/**
- * Definition for tables.  The general idea is to extract out all of the
- * essential bits, and then reconstruct it later.
- *
- * This is a bit confusing, because the DTDs and the W3C
- * validators seem to disagree on the appropriate definition. The
- * DTD claims:
- *
- *      (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
- *
- * But actually, the HTML4 spec then has this to say:
- *
- *      The TBODY start tag is always required except when the table
- *      contains only one table body and no table head or foot sections.
- *      The TBODY end tag may always be safely omitted.
- *
- * So the DTD is kind of wrong.  The validator is, unfortunately, kind
- * of on crack.
- *
- * The definition changed again in XHTML1.1; and in my opinion, this
- * formulation makes the most sense.
- *
- *      caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
- *
- * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
- * If we encounter a thead, tfoot or tbody, we are placed in the former
- * mode, and we *must* wrap any stray tr segments with a tbody. But if
- * we don't run into any of them, just have tr tags is OK.
- */
-class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
-{
-    /**
-     * @type bool
-     */
-    public $allow_empty = false;
-
-    /**
-     * @type string
-     */
-    public $type = 'table';
-
-    /**
-     * @type array
-     */
-    public $elements = array(
-        'tr' => true,
-        'tbody' => true,
-        'thead' => true,
-        'tfoot' => true,
-        'caption' => true,
-        'colgroup' => true,
-        'col' => true
-    );
-
-    public function __construct()
-    {
-    }
-
-    /**
-     * @param array $tokens_of_children
-     * @param HTMLPurifier_Config $config
-     * @param HTMLPurifier_Context $context
-     * @return array
-     */
-    public function validateChildren($tokens_of_children, $config, $context)
-    {
-        if (empty($tokens_of_children)) {
-            return false;
-        }
-
-        // this ensures that the loop gets run one last time before closing
-        // up. It's a little bit of a hack, but it works! Just make sure you
-        // get rid of the token later.
-        $tokens_of_children[] = false;
-
-        // only one of these elements is allowed in a table
-        $caption = false;
-        $thead = false;
-        $tfoot = false;
-
-        // as many of these as you want
-        $cols = array();
-        $content = array();
-
-        $nesting = 0; // current depth so we can determine nodes
-        $is_collecting = false; // are we globbing together tokens to package
-        // into one of the collectors?
-        $collection = array(); // collected nodes
-        // INVARIANT: if $is_collecting, then !empty($collection)
-        // The converse does NOT hold, see [WHITESPACE]
-        $tag_index = 0; // the first node might be whitespace,
-        // so this tells us where the start tag is
-        $tbody_mode = false; // if true, then we need to wrap any stray
-        // <tr>s with a <tbody>.
-
-        foreach ($tokens_of_children as $token) {
-            $is_child = ($nesting == 0);
-
-            if ($token === false) {
-                // terminating sequence started
-            } elseif ($token instanceof HTMLPurifier_Token_Start) {
-                $nesting++;
-            } elseif ($token instanceof HTMLPurifier_Token_End) {
-                $nesting--;
-            }
-
-            // handle node collection
-            if ($is_collecting) {
-                if ($is_child) {
-                    // okay, let's stash the tokens away
-                    // first token tells us the type of the collection
-                    switch ($collection[$tag_index]->name) {
-                        case 'tbody':
-                            $tbody_mode = true;
-                            // fall through
-                        case 'tr':
-                            $content[] = $collection;
-                            break;
-                        case 'caption':
-                            if ($caption !== false) {
-                                break;
-                            }
-                            $caption = $collection;
-                            break;
-                        case 'thead':
-                        case 'tfoot':
-                            $tbody_mode = true;
-                            // XXX This breaks rendering properties with
-                            // Firefox, which never floats a <thead> to
-                            // the top. Ever. (Our scheme will float the
-                            // first <thead> to the top.)  So maybe
-                            // <thead>s that are not first should be
-                            // turned into <tbody>? Very tricky, indeed.
-
-                            // access the appropriate variable, $thead or $tfoot
-                            $var = $collection[$tag_index]->name;
-                            if ($$var === false) {
-                                $$var = $collection;
-                            } else {
-                                // Oops, there's a second one! What
-                                // should we do?  Current behavior is to
-                                // transmutate the first and last entries into
-                                // tbody tags, and then put into content.
-                                // Maybe a better idea is to *attach
-                                // it* to the existing thead or tfoot?
-                                // We don't do this, because Firefox
-                                // doesn't float an extra tfoot to the
-                                // bottom like it does for the first one.
-                                $collection[$tag_index]->name = 'tbody';
-                                $collection[count($collection) - 1]->name = 'tbody';
-                                $content[] = $collection;
-                            }
-                            break;
-                        case 'colgroup':
-                            $cols[] = $collection;
-                            break;
-                    }
-                    $collection = array();
-                    $is_collecting = false;
-                    $tag_index = 0;
-                } else {
-                    // add the node to the collection
-                    $collection[] = $token;
-                }
-            }
-
-            // terminate
-            if ($token === false) {
-                break;
-            }
-
-            if ($is_child) {
-                // determine what we're dealing with
-                if ($token->name == 'col') {
-                    // the only empty tag in the possie, we can handle it
-                    // immediately
-                    $cols[] = array_merge($collection, array($token));
-                    $collection = array();
-                    $is_collecting = false;
-                    $tag_index = 0;
-                    continue;
-                }
-                switch ($token->name) {
-                    case 'caption':
-                    case 'colgroup':
-                    case 'thead':
-                    case 'tfoot':
-                    case 'tbody':
-                    case 'tr':
-                        $is_collecting = true;
-                        $collection[] = $token;
-                        continue;
-                    default:
-                        // [WHITESPACE] Whitespace is added to the
-                        // collection without triggering collection
-                        // mode. This is a hack to make whitespace
-                        // 'sticky' (that is to say, we ought /not/ to
-                        // drop whitespace.)
-                        if (!empty($token->is_whitespace)) {
-                            $collection[] = $token;
-                            $tag_index++;
-                        }
-                        continue;
-                }
-            }
-        }
-
-        if (empty($content)) {
-            return false;
-        }
-        // INVARIANT: all members of content are non-empty.  This can
-        // be shown by observing when things are pushed onto content:
-        // they are only ever pushed when is_collecting is true, and
-        // collection is the only thing ever pushed; but it is known
-        // that collections are non-empty when is_collecting is true.
-
-        $ret = array();
-        if ($caption !== false) {
-            $ret = array_merge($ret, $caption);
-        }
-        if ($cols !== false) {
-            foreach ($cols as $token_array) {
-                $ret = array_merge($ret, $token_array);
-            }
-        }
-        if ($thead !== false) {
-            $ret = array_merge($ret, $thead);
-        }
-        if ($tfoot !== false) {
-            $ret = array_merge($ret, $tfoot);
-        }
-
-        if ($tbody_mode) {
-            // a little tricky, since the start of the collection may be
-            // whitespace
-            $inside_tbody = false;
-            foreach ($content as $token_array) {
-                // find the starting token
-                // INVARIANT: token_array is not empty
-                $t = NULL;
-                foreach ($token_array as $t) {
-                    if ($t->name === 'tr' || $t->name === 'tbody') {
-                        break;
-                    }
-                } // iterator variable carries over
-                if ($t->name === 'tr') {
-                    if ($inside_tbody) {
-                        $ret = array_merge($ret, $token_array);
-                    } else {
-                        $ret[] = new HTMLPurifier_Token_Start('tbody');
-                        $ret = array_merge($ret, $token_array);
-                        $inside_tbody = true;
-                    }
-                } elseif ($t->name === 'tbody') {
-                    if ($inside_tbody) {
-                        $ret[] = new HTMLPurifier_Token_End('tbody');
-                        $inside_tbody = false;
-                        $ret = array_merge($ret, $token_array);
-                    } else {
-                        $ret = array_merge($ret, $token_array);
-                    }
-                } else {
-                    trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR);
-                }
-            }
-            if ($inside_tbody) {
-                $ret[] = new HTMLPurifier_Token_End('tbody');
-            }
-        } else {
-            foreach ($content as $token_array) {
-                // invariant: everything in here is <tr>s
-                $ret = array_merge($ret, $token_array);
-            }
-        }
-
-        if (!empty($collection) && $is_collecting == false) {
-            // grab the trailing space
-            $ret = array_merge($ret, $collection);
-        }
-
-        array_pop($tokens_of_children); // remove phantom token
-
-        return ($ret === $tokens_of_children) ? true : $ret;
-
-    }
-}
-
-// vim: et sw=4 sts=4
+<?php
+
+/**
+ * Definition for tables.  The general idea is to extract out all of the
+ * essential bits, and then reconstruct it later.
+ *
+ * This is a bit confusing, because the DTDs and the W3C
+ * validators seem to disagree on the appropriate definition. The
+ * DTD claims:
+ *
+ *      (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
+ *
+ * But actually, the HTML4 spec then has this to say:
+ *
+ *      The TBODY start tag is always required except when the table
+ *      contains only one table body and no table head or foot sections.
+ *      The TBODY end tag may always be safely omitted.
+ *
+ * So the DTD is kind of wrong.  The validator is, unfortunately, kind
+ * of on crack.
+ *
+ * The definition changed again in XHTML1.1; and in my opinion, this
+ * formulation makes the most sense.
+ *
+ *      caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))
+ *
+ * Essentially, we have two modes: thead/tfoot/tbody mode, and tr mode.
+ * If we encounter a thead, tfoot or tbody, we are placed in the former
+ * mode, and we *must* wrap any stray tr segments with a tbody. But if
+ * we don't run into any of them, just having tr tags is OK.
+ */
+class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
+{
+    /**
+     * @type bool
+     */
+    public $allow_empty = false;
+
+    /**
+     * @type string
+     */
+    public $type = 'table';
+
+    /**
+     * @type array
+     */
+    public $elements = array(
+        'tr' => true,
+        'tbody' => true,
+        'thead' => true,
+        'tfoot' => true,
+        'caption' => true,
+        'colgroup' => true,
+        'col' => true
+    );
+
+    public function __construct()
+    {
+    }
+
+    /**
+     * @param array $children
+     * @param HTMLPurifier_Config $config
+     * @param HTMLPurifier_Context $context
+     * @return array
+     */
+    public function validateChildren($children, $config, $context)
+    {
+        if (empty($children)) {
+            return false;
+        }
+
+        // only one of these elements is allowed in a table
+        $caption = false;
+        $thead = false;
+        $tfoot = false;
+
+        // whitespace
+        $initial_ws = array();
+        $after_caption_ws = array();
+        $after_thead_ws = array();
+        $after_tfoot_ws = array();
+
+        // as many of these as you want
+        $cols = array();
+        $content = array();
+
+        $tbody_mode = false; // if true, then we need to wrap any stray
+                             // <tr>s with a <tbody>.
+
+        $ws_accum =& $initial_ws;
+
+        foreach ($children as $node) {
+            if ($node instanceof HTMLPurifier_Node_Comment) {
+                $ws_accum[] = $node;
+                continue;
+            }
+            switch ($node->name) {
+            case 'tbody':
+                $tbody_mode = true;
+                // fall through
+            case 'tr':
+                $content[] = $node;
+                $ws_accum =& $content;
+                break;
+            case 'caption':
+                // there can only be one caption!
+                if ($caption !== false)  break;
+                $caption = $node;
+                $ws_accum =& $after_caption_ws;
+                break;
+            case 'thead':
+                $tbody_mode = true;
+                // XXX This breaks rendering properties with
+                // Firefox, which never floats a <thead> to
+                // the top. Ever. (Our scheme will float the
+                // first <thead> to the top.)  So maybe
+                // <thead>s that are not first should be
+                // turned into <tbody>? Very tricky, indeed.
+                if ($thead === false) {
+                    $thead = $node;
+                    $ws_accum =& $after_thead_ws;
+                } else {
+                    // Oops, there's a second one! What
+                    // should we do?  Current behavior is to
+                    // transmute the extra node into a tbody
+                    // element, and then put it into content.
+                    // Maybe a better idea is to *attach
+                    // it* to the existing thead or tfoot?
+                    // We don't do this, because Firefox
+                    // doesn't float an extra tfoot to the
+                    // bottom like it does for the first one.
+                    $node->name = 'tbody';
+                    $content[] = $node;
+                    $ws_accum =& $content;
+                }
+                break;
+            case 'tfoot':
+                // see above for some caveats
+                $tbody_mode = true;
+                if ($tfoot === false) {
+                    $tfoot = $node;
+                    $ws_accum =& $after_tfoot_ws;
+                } else {
+                    $node->name = 'tbody';
+                    $content[] = $node;
+                    $ws_accum =& $content;
+                }
+                break;
+            case 'colgroup':
+            case 'col':
+                $cols[] = $node;
+                $ws_accum =& $cols;
+                break;
+            case '#PCDATA':
+                // How is whitespace handled? We treat it as sticky to
+                // the *end* of the previous element. So all of the
+                // nonsense we have worked on is to keep things
+                // together.
+                if (!empty($node->is_whitespace)) {
+                    $ws_accum[] = $node;
+                }
+                break;
+            }
+        }
+
+        if (empty($content)) {
+            return false;
+        }
+
+        $ret = $initial_ws;
+        if ($caption !== false) {
+            $ret[] = $caption;
+            $ret = array_merge($ret, $after_caption_ws);
+        }
+        if ($cols !== false) {
+            $ret = array_merge($ret, $cols);
+        }
+        if ($thead !== false) {
+            $ret[] = $thead;
+            $ret = array_merge($ret, $after_thead_ws);
+        }
+        if ($tfoot !== false) {
+            $ret[] = $tfoot;
+            $ret = array_merge($ret, $after_tfoot_ws);
+        }
+
+        if ($tbody_mode) {
+            // we have to shuffle tr into tbody
+            $current_tr_tbody = null;
+
+            foreach($content as $node) {
+                switch ($node->name) {
+                case 'tbody':
+                    $current_tr_tbody = null;
+                    $ret[] = $node;
+                    break;
+                case 'tr':
+                    if ($current_tr_tbody === null) {
+                        $current_tr_tbody = new HTMLPurifier_Node_Element('tbody');
+                        $ret[] = $current_tr_tbody;
+                    }
+                    $current_tr_tbody->children[] = $node;
+                    break;
+                case '#PCDATA':
+                    assert($node->is_whitespace);
+                    if ($current_tr_tbody === null) {
+                        $ret[] = $node;
+                    } else {
+                        $current_tr_tbody->children[] = $node;
+                    }
+                    break;
+                }
+            }
+        } else {
+            $ret = array_merge($ret, $content);
+        }
+
+        return $ret;
+
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidChildren.txt b/library/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidChildren.txt
index 4d5b5055..a3881be7 100644
--- a/library/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidChildren.txt
+++ b/library/HTMLPurifier/ConfigSchema/schema/Core.EscapeInvalidChildren.txt
@@ -2,9 +2,11 @@ Core.EscapeInvalidChildren
 TYPE: bool
 DEFAULT: false
 --DESCRIPTION--
-When true, a child is found that is not allowed in the context of the
+<p><strong>Warning:</strong> this configuration option no longer does anything as of 4.6.0.</p>
+
+<p>When true, a child is found that is not allowed in the context of the
 parent element will be transformed into text as if it were ASCII. When
 false, that element and all internal tags will be dropped, though text will
 be preserved.  There is no option for dropping the element but preserving
-child nodes.
+child nodes.</p>
 --# vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/Node.php b/library/HTMLPurifier/Node.php
index 9e239b3c..3995fec9 100644
--- a/library/HTMLPurifier/Node.php
+++ b/library/HTMLPurifier/Node.php
@@ -30,6 +30,15 @@ abstract class HTMLPurifier_Node
     public $armor = array();
 
     /**
+     * When true, this node should be ignored as non-existent.
+     *
+     * Who is responsible for ignoring dead nodes?  FixNesting is
+     * responsible for removing them before passing on to child
+     * validators.
+     */
+    public $dead = false;
+
+    /**
      * Returns a pair of start and end tokens, where the end token
      * is null if it is not necessary. Does not include children.
      * @type array
diff --git a/library/HTMLPurifier/Node/Text.php b/library/HTMLPurifier/Node/Text.php
index 03dc1b20..aec91664 100644
--- a/library/HTMLPurifier/Node/Text.php
+++ b/library/HTMLPurifier/Node/Text.php
@@ -13,6 +13,13 @@ class HTMLPurifier_Node_Text extends HTMLPurifier_Node
 {
 
     /**
+     * PCDATA tag name compatible with DTD, see
+     * HTMLPurifier_ChildDef_Custom for details.
+     * @type string
+     */
+    public $name = '#PCDATA';
+
+    /**
      * @type string
      */
     public $data;
diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php
dissimilarity index 78%
index f78ad086..6fa673db 100644
--- a/library/HTMLPurifier/Strategy/FixNesting.php
+++ b/library/HTMLPurifier/Strategy/FixNesting.php
@@ -1,363 +1,181 @@
-<?php
-
-/**
- * Takes a well formed list of tokens and fixes their nesting.
- *
- * HTML elements dictate which elements are allowed to be their children,
- * for example, you can't have a p tag in a span tag.  Other elements have
- * much more rigorous definitions: tables, for instance, require a specific
- * order for their elements.  There are also constraints not expressible by
- * document type definitions, such as the chameleon nature of ins/del
- * tags and global child exclusions.
- *
- * The first major objective of this strategy is to iterate through all the
- * nodes (not tokens) of the list of tokens and determine whether or not
- * their children conform to the element's definition.  If they do not, the
- * child definition may optionally supply an amended list of elements that
- * is valid or require that the entire node be deleted (and the previous
- * node rescanned).
- *
- * The second objective is to ensure that explicitly excluded elements of
- * an element do not appear in its children.  Code that accomplishes this
- * task is pervasive through the strategy, though the two are distinct tasks
- * and could, theoretically, be seperated (although it's not recommended).
- *
- * @note Whether or not unrecognized children are silently dropped or
- *       translated into text depends on the child definitions.
- *
- * @todo Enable nodes to be bubbled out of the structure.
- *
- * @warning This algorithm (though it may be hard to see) proceeds from
- *          a top-down fashion.  Thus, parents are processed before
- *          children.  This is easy to implement and has a nice effiency
- *          benefit, in that if a node is removed, we never waste any
- *          time processing it, but it also means that if a child
- *          changes in a non-encapsulated way (e.g. it is removed), we
- *          need to go back and reprocess the parent to see if those
- *          changes resulted in problems for the parent.  See
- *          [BACKTRACK] for an example of this.  In the current
- *          implementation, this backtracking can only be triggered when
- *          a node is removed and if that node was the sole node, the
- *          parent would need to be removed.  As such, it is easy to see
- *          that backtracking only incurs constant overhead.  If more
- *          sophisticated backtracking is implemented, care must be
- *          taken to avoid nontermination or exponential blowup.
- */
-
-class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
-{
-
-    /**
-     * @param HTMLPurifier_Token[] $tokens
-     * @param HTMLPurifier_Config $config
-     * @param HTMLPurifier_Context $context
-     * @return array|HTMLPurifier_Token[]
-     */
-    public function execute($tokens, $config, $context)
-    {
-        //####################################################################//
-        // Pre-processing
-
-        //$node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);
-        //$new_tokens = HTMLPurifier_Arborize::flatten($node, $config, $context);
-
-        // get a copy of the HTML definition
-        $definition = $config->getHTMLDefinition();
-
-        $excludes_enabled = !$config->get('Core.DisableExcludes');
-
-        // insert implicit "parent" node, will be removed at end.
-        // DEFINITION CALL
-        $parent_name = $definition->info_parent;
-        array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
-        $tokens[] = new HTMLPurifier_Token_End($parent_name);
-
-        // setup the context variable 'IsInline', for chameleon processing
-        // is 'false' when we are not inline, 'true' when it must always
-        // be inline, and an integer when it is inline for a certain
-        // branch of the document tree
-        $is_inline = $definition->info_parent_def->descendants_are_inline;
-        $context->register('IsInline', $is_inline);
-
-        // setup error collector
-        $e =& $context->get('ErrorCollector', true);
-
-        //####################################################################//
-        // Loop initialization
-
-        // stack that contains the indexes of all parents,
-        // $stack[count($stack)-1] being the current parent
-        $stack = array();
-
-        // stack that contains all elements that are excluded
-        // it is organized by parent elements, similar to $stack,
-        // but it is only populated when an element with exclusions is
-        // processed, i.e. there won't be empty exclusions.
-        $exclude_stack = array();
-
-        // variable that contains the start token while we are processing
-        // nodes. This enables error reporting to do its job
-        $start_token = false;
-        $context->register('CurrentToken', $start_token);
-
-        //####################################################################//
-        // Loop
-
-        // iterate through all start nodes. Determining the start node
-        // is complicated so it has been omitted from the loop construct
-        for ($i = 0, $size = count($tokens); $i < $size;) {
-
-            //################################################################//
-            // Gather information on children
-
-            // child token accumulator
-            $child_tokens = array();
-
-            // scroll to the end of this node, report number, and collect
-            // all children
-            for ($j = $i, $depth = 0; ; $j++) {
-                if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
-                    $depth++;
-                    // skip token assignment on first iteration, this is the
-                    // token we currently are on
-                    if ($depth == 1) {
-                        continue;
-                    }
-                } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
-                    $depth--;
-                    // skip token assignment on last iteration, this is the
-                    // end token of the token we're currently on
-                    if ($depth == 0) {
-                        break;
-                    }
-                }
-                $child_tokens[] = $tokens[$j];
-            }
-
-            // $i is index of start token
-            // $j is index of end token
-
-            $start_token = $tokens[$i]; // to make token available via CurrentToken
-
-            //################################################################//
-            // Gather information on parent
-
-            // calculate parent information
-            if ($count = count($stack)) {
-                $parent_index = $stack[$count - 1];
-                $parent_name = $tokens[$parent_index]->name;
-                if ($parent_index == 0) {
-                    $parent_def = $definition->info_parent_def;
-                } else {
-                    $parent_def = $definition->info[$parent_name];
-                }
-            } else {
-                // processing as if the parent were the "root" node
-                // unknown info, it won't be used anyway, in the future,
-                // we may want to enforce one element only (this is
-                // necessary for HTML Purifier to clean entire documents
-                $parent_index = $parent_name = $parent_def = null;
-            }
-
-            // calculate context
-            if ($is_inline === false) {
-                // check if conditions make it inline
-                if (!empty($parent_def) && $parent_def->descendants_are_inline) {
-                    $is_inline = $count - 1;
-                }
-            } else {
-                // check if we're out of inline
-                if ($count === $is_inline) {
-                    $is_inline = false;
-                }
-            }
-
-            //################################################################//
-            // Determine whether element is explicitly excluded SGML-style
-
-            // determine whether or not element is excluded by checking all
-            // parent exclusions. The array should not be very large, two
-            // elements at most.
-            $excluded = false;
-            if (!empty($exclude_stack) && $excludes_enabled) {
-                foreach ($exclude_stack as $lookup) {
-                    if (isset($lookup[$tokens[$i]->name])) {
-                        $excluded = true;
-                        // no need to continue processing
-                        break;
-                    }
-                }
-            }
-
-            //################################################################//
-            // Perform child validation
-
-            if ($excluded) {
-                // there is an exclusion, remove the entire node
-                $result = false;
-                $excludes = array(); // not used, but good to initialize anyway
-            } else {
-                // DEFINITION CALL
-                if ($i === 0) {
-                    // special processing for the first node
-                    $def = $definition->info_parent_def;
-                } else {
-                    $def = $definition->info[$tokens[$i]->name];
-
-                }
-
-                if (!empty($def->child)) {
-                    // have DTD child def validate children
-                    $result = $def->child->validateChildren(
-                        $child_tokens,
-                        $config,
-                        $context
-                    );
-                } else {
-                    // weird, no child definition, get rid of everything
-                    $result = false;
-                }
-
-                // determine whether or not this element has any exclusions
-                $excludes = $def->excludes;
-            }
-
-            // $result is now a bool or array
-
-            //################################################################//
-            // Process result by interpreting $result
-
-            if ($result === true || $child_tokens === $result) {
-                // leave the node as is
-
-                // register start token as a parental node start
-                $stack[] = $i;
-
-                // register exclusions if there are any
-                if (!empty($excludes)) {
-                    $exclude_stack[] = $excludes;
-                }
-
-                // move cursor to next possible start node
-                $i++;
-
-            } elseif ($result === false) {
-                // remove entire node
-
-                if ($e) {
-                    if ($excluded) {
-                        $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
-                    } else {
-                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
-                    }
-                }
-
-                // calculate length of inner tokens and current tokens
-                $length = $j - $i + 1;
-
-                // perform removal
-                array_splice($tokens, $i, $length);
-
-                // update size
-                $size -= $length;
-
-                // there is no start token to register,
-                // current node is now the next possible start node
-                // unless it turns out that we need to do a double-check
-
-                // this is a rought heuristic that covers 100% of HTML's
-                // cases and 99% of all other cases. A child definition
-                // that would be tricked by this would be something like:
-                // ( | a b c) where it's all or nothing. Fortunately,
-                // our current implementation claims that that case would
-                // not allow empty, even if it did
-                if (!$parent_def->child->allow_empty) {
-                    // we need to do a double-check [BACKTRACK]
-                    $i = $parent_index;
-                    array_pop($stack);
-                }
-
-                // PROJECTED OPTIMIZATION: Process all children elements before
-                // reprocessing parent node.
-
-            } else {
-                // replace node with $result
-
-                // calculate length of inner tokens
-                $length = $j - $i - 1;
-
-                if ($e) {
-                    if (empty($result) && $length) {
-                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
-                    } else {
-                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
-                    }
-                }
-
-                // perform replacement
-                array_splice($tokens, $i + 1, $length, $result);
-
-                // update size
-                $size -= $length;
-                $size += count($result);
-
-                // register start token as a parental node start
-                $stack[] = $i;
-
-                // register exclusions if there are any
-                if (!empty($excludes)) {
-                    $exclude_stack[] = $excludes;
-                }
-
-                // move cursor to next possible start node
-                $i++;
-            }
-
-            //################################################################//
-            // Scroll to next start node
-
-            // We assume, at this point, that $i is the index of the token
-            // that is the first possible new start point for a node.
-
-            // Test if the token indeed is a start tag, if not, move forward
-            // and test again.
-            $size = count($tokens);
-            while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) {
-                if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
-                    // pop a token index off the stack if we ended a node
-                    array_pop($stack);
-                    // pop an exclusion lookup off exclusion stack if
-                    // we ended node and that node had exclusions
-                    if ($i == 0 || $i == $size - 1) {
-                        // use specialized var if it's the super-parent
-                        $s_excludes = $definition->info_parent_def->excludes;
-                    } else {
-                        $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
-                    }
-                    if ($s_excludes) {
-                        array_pop($exclude_stack);
-                    }
-                }
-                $i++;
-            }
-
-        }
-
-        //####################################################################//
-        // Post-processing
-
-        // remove implicit parent tokens at the beginning and end
-        array_shift($tokens);
-        array_pop($tokens);
-
-        // remove context variables
-        $context->destroy('IsInline');
-        $context->destroy('CurrentToken');
-
-        //####################################################################//
-        // Return
-        return $tokens;
-    }
-}
-
-// vim: et sw=4 sts=4
+<?php
+
+/**
+ * Takes a well formed list of tokens and fixes their nesting.
+ *
+ * HTML elements dictate which elements are allowed to be their children,
+ * for example, you can't have a p tag in a span tag.  Other elements have
+ * much more rigorous definitions: tables, for instance, require a specific
+ * order for their elements.  There are also constraints not expressible by
+ * document type definitions, such as the chameleon nature of ins/del
+ * tags and global child exclusions.
+ *
+ * The first major objective of this strategy is to iterate through all
+ * the nodes and determine whether or not their children conform to the
+ * element's definition.  If they do not, the child definition may
+ * optionally supply an amended list of elements that is valid or
+ * require that the entire node be deleted (it is marked dead and
+ * dropped when its parent's children are validated).
+ *
+ * The second objective is to ensure that explicitly excluded elements of
+ * an element do not appear in its children.  Code that accomplishes this
+ * task is pervasive through the strategy, though the two are distinct tasks
+ * and could, theoretically, be separated (although it's not recommended).
+ *
+ * @note Whether or not unrecognized children are silently dropped or
+ *       translated into text depends on the child definitions.
+ *
+ * @todo Enable nodes to be bubbled out of the structure.  This is
+ *       easier with our new algorithm.
+ */
+
+class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
+{
+
+    /**
+     * @param HTMLPurifier_Token[] $tokens Well-formed token list; arborized into a tree first
+     * @param HTMLPurifier_Config $config Supplies the HTML definition and Core.DisableExcludes
+     * @param HTMLPurifier_Context $context Receives IsInline/CurrentNode/CurrentToken while running
+     * @return array|HTMLPurifier_Token[] Tokens flattened back from the validated tree
+     */
+    public function execute($tokens, $config, $context)
+    {
+
+        //####################################################################//
+        // Pre-processing
+
+        // O(n) pass to convert to a tree, so that we can efficiently
+        // refer to subtrees instead of token ranges
+        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);
+
+        // get a copy of the HTML definition
+        $definition = $config->getHTMLDefinition();
+
+        $excludes_enabled = !$config->get('Core.DisableExcludes');
+
+        // setup the context variable 'IsInline', for chameleon processing;
+        // it starts as the parent definition's descendants_are_inline and
+        // $is_inline is reassigned from each stack frame in the loop below
+        // (presumably register() binds by reference -- TODO confirm)
+        $is_inline = $definition->info_parent_def->descendants_are_inline;
+        $context->register('IsInline', $is_inline);
+
+        // setup error collector; every use below is guarded by if ($e)
+        $e =& $context->get('ErrorCollector', true);
+
+        //####################################################################//
+        // Loop initialization
+
+        // NOTE(review): $exclude_stack looks like a leftover of the
+        // token-based implementation: it is assigned here but never read
+        // again in this method.  Exclusions are carried per stack frame
+        // instead (the $excludes element of each frame pushed on $stack).
+        $exclude_stack = array($definition->info_parent_def->excludes);
+
+        // $node and $token are registered in the context while we are
+        // processing nodes. This enables error reporting to do its job
+        $node = $top_node;
+        // dummy token: only the first element of the pair is used here
+        list($token, $d) = $node->toTokenPair();
+        $context->register('CurrentNode', $node);
+        $context->register('CurrentToken', $token);
+
+        //####################################################################//
+        // Loop
+
+        // We need to implement a post-order traversal iteratively, to
+        // avoid running into stack space limits.  This is pretty tricky
+        // to reason about, so we just manually stack-ify the recursive
+        // variant:
+        //
+        //  function f($node) {
+        //      foreach ($node->children as $child) {
+        //          f($child);
+        //      }
+        //      validate($node);
+        //  }
+        //
+        // Thus, we will represent a stack frame as array($node,
+        // $is_inline, $excludes, $ix), where $ix is the index of the
+        // next entry of $node->children to visit; children before $ix
+        // have already been processed.
+
+        $parent_def = $definition->info_parent_def;
+        $stack = array(
+            array($top_node,
+                  $parent_def->descendants_are_inline,
+                  $parent_def->excludes, // exclusions
+                  0)
+            );
+
+        while (!empty($stack)) {
+            list($node, $is_inline, $excludes, $ix) = array_pop($stack);
+            // recursive call: find the next element child, if any, and descend
+            $go = false;
+            $def = empty($stack) ? $definition->info_parent_def : $definition->info[$node->name];
+            while (isset($node->children[$ix])) {
+                $child = $node->children[$ix++];
+                if ($child instanceof HTMLPurifier_Node_Element) {
+                    $go = true;
+                    $stack[] = array($node, $is_inline, $excludes, $ix);
+                    $stack[] = array($child,
+                        // ToDo: I don't think it matters if it's def or
+                        // child_def, but double check this...
+                        $is_inline || $def->descendants_are_inline,
+                        empty($def->excludes) ? $excludes
+                                              : array_merge($excludes, $def->excludes),
+                        0);
+                    break;
+                }
+            };
+            if ($go) continue;
+            list($token, $d) = $node->toTokenPair();
+            // base case: no element children left to visit, validate $node itself
+            if ($excludes_enabled && isset($excludes[$node->name])) {
+                $node->dead = true;
+                if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
+            } else {
+                // XXX I suppose it would be slightly more efficient to
+                // avoid the allocation here and have children
+                // strategies handle it
+                $children = array();
+                foreach ($node->children as $child) {
+                    if (!$child->dead) $children[] = $child;
+                }
+                $result = $def->child->validateChildren($children, $config, $context);
+                if ($result === true) {
+                    // children are valid as they are; keep them, minus any dead ones
+                    $node->children = $children;
+                } elseif ($result === false) {
+                    $node->dead = true;
+                    if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
+                } else {
+                    $node->children = $result;
+                    if ($e) {
+                        // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
+                        if (empty($result) && !empty($children)) {
+                            $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
+                        } else if ($result != $children) {
+                            $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
+                        }
+                    }
+                }
+            }
+        }
+
+        //####################################################################//
+        // Post-processing
+
+        // remove context variables
+        $context->destroy('IsInline');
+        $context->destroy('CurrentNode');
+        $context->destroy('CurrentToken');
+
+        //####################################################################//
+        // Return ($node is the root again: its frame is the last one popped)
+
+        return HTMLPurifier_Arborize::flatten($node, $config, $context);
+    }
+}
+
+// vim: et sw=4 sts=4
diff --git a/tests/HTMLPurifier/ChildDef/CustomTest.php b/tests/HTMLPurifier/ChildDef/CustomTest.php
index 55327e83..0094323d 100644
--- a/tests/HTMLPurifier/ChildDef/CustomTest.php
+++ b/tests/HTMLPurifier/ChildDef/CustomTest.php
@@ -3,6 +3,11 @@
 class HTMLPurifier_ChildDef_CustomTest extends HTMLPurifier_ChildDefHarness
 {
 
+    public function setUp()
+    {
+        parent::setUp(); // nothing test-specific to prepare; defers to the harness
+    }
+
     public function test()
     {
         $this->obj = new HTMLPurifier_ChildDef_Custom('(a,b?,c*,d+,(a,b)*)');
diff --git a/tests/HTMLPurifier/ChildDef/ListTest.php b/tests/HTMLPurifier/ChildDef/ListTest.php
index 694ce378..0e3d5c72 100644
--- a/tests/HTMLPurifier/ChildDef/ListTest.php
+++ b/tests/HTMLPurifier/ChildDef/ListTest.php
@@ -24,12 +24,6 @@ class HTMLPurifier_ChildDef_ListTest extends HTMLPurifier_ChildDefHarness
         $this->assertResult('<li>asdf</li><li />');
     }
 
-    public function testIllegal()
-    {
-        // XXX actually this never gets triggered in practice
-        $this->assertResult('<li /><b />', '<li /><li><b /></li>');
-    }
-
     public function testOlAtBeginning()
     {
         $this->assertResult('<ol />', '<li><ol /></li>');
diff --git a/tests/HTMLPurifier/ChildDef/RequiredTest.php b/tests/HTMLPurifier/ChildDef/RequiredTest.php
index 14638858..d4f7c988 100644
--- a/tests/HTMLPurifier/ChildDef/RequiredTest.php
+++ b/tests/HTMLPurifier/ChildDef/RequiredTest.php
@@ -68,15 +68,10 @@ class HTMLPurifier_ChildDef_RequiredTest extends HTMLPurifier_ChildDefHarness
         $this->obj = new HTMLPurifier_ChildDef_Required('#PCDATA | b');
         $this->assertResult('Out <b>Bold text</b><img />', 'Out <b>Bold text</b>');
     }
-
-    public function testPCDATAAllowedWithEscaping()
+    public function testPCDATAAllowedJump()
     {
         $this->obj = new HTMLPurifier_ChildDef_Required('#PCDATA | b');
-        $this->config->set('Core.EscapeInvalidChildren', true);
-        $this->assertResult(
-            'Out <b>Bold text</b><img />',
-            'Out <b>Bold text</b>&lt;img /&gt;'
-        );
+        $this->assertResult('A <i>foo</i>', 'A foo');
     }
 }
 
diff --git a/tests/HTMLPurifier/ChildDef/TableTest.php b/tests/HTMLPurifier/ChildDef/TableTest.php
index 61027e71..49e96b65 100644
--- a/tests/HTMLPurifier/ChildDef/TableTest.php
+++ b/tests/HTMLPurifier/ChildDef/TableTest.php
@@ -76,7 +76,7 @@ class HTMLPurifier_ChildDef_TableTest extends HTMLPurifier_ChildDefHarness
         $this->config->set('Output.Newline', "\n");
         $this->assertResult(
           "\n\t<tbody />\n\t\t<tfoot />\n\t\t\t",
-          "\n\t\t<tfoot />\n\t<tbody />\n\t\t\t"
+          "\n\t<tfoot />\n\t\t\t<tbody />\n\t\t"
         );
 
     }
diff --git a/tests/HTMLPurifier/ChildDefHarness.php b/tests/HTMLPurifier/ChildDefHarness.php
index 26bb4c2d..d21070c9 100644
--- a/tests/HTMLPurifier/ChildDefHarness.php
+++ b/tests/HTMLPurifier/ChildDefHarness.php
@@ -8,8 +8,8 @@ class HTMLPurifier_ChildDefHarness extends HTMLPurifier_ComplexHarness
         parent::setUp();
         $this->obj       = null;
         $this->func      = 'validateChildren';
-        $this->to_tokens = true;
         $this->to_html   = true;
+        $this->to_node_list = true;
     }
 
 }
diff --git a/tests/HTMLPurifier/ComplexHarness.php b/tests/HTMLPurifier/ComplexHarness.php
index eee718a9..55c10ca9 100644
--- a/tests/HTMLPurifier/ComplexHarness.php
+++ b/tests/HTMLPurifier/ComplexHarness.php
@@ -29,6 +29,14 @@ class HTMLPurifier_ComplexHarness extends HTMLPurifier_Harness
     protected $to_tokens = false;
 
     /**
+     * Whether or not the method deals in a node list.
+     * If set to true, assertResult() will transparently convert HTML
+     * to a node list and back.
+     * @type bool
+     */
+    protected $to_node_list = false;
+
+    /**
      * Whether or not to convert tokens back into HTML before performing
      * equality check, has no effect on bools.
      * @type bool
@@ -54,9 +62,12 @@ class HTMLPurifier_ComplexHarness extends HTMLPurifier_Harness
      */
     protected function assertResult($input, $expect = true)
     {
-        if ($this->to_tokens && is_string($input)) {
-            // $func may cause $input to change, so "clone" another copy
-            // to sacrifice
+        // $func may cause $input to change, so "clone" another copy
+        // to sacrifice
+        if ($this->to_node_list && is_string($input)) {
+            $input = HTMLPurifier_Arborize::arborize($this->tokenize($temp = $input), $this->config, $this->context)->children;
+            $input_c = HTMLPurifier_Arborize::arborize($this->tokenize($temp), $this->config, $this->context)->children;
+        } elseif ($this->to_tokens && is_string($input)) {
             $input   = $this->tokenize($temp = $input);
             $input_c = $this->tokenize($temp);
         } else {
@@ -76,6 +87,12 @@ class HTMLPurifier_ComplexHarness extends HTMLPurifier_Harness
         }
 
         if ($this->to_html) {
+            if ($this->to_node_list) {
+                $result = $this->generateTokens($result);
+                if (is_array($expect) && !empty($expect) && $expect[0] instanceof HTMLPurifier_Node) {
+                    $expect = $this->generateTokens($expect);
+                }
+            }
             $result = $this->generate($result);
             if (is_array($expect)) {
                 $expect = $this->generate($expect);
@@ -106,6 +123,16 @@ class HTMLPurifier_ComplexHarness extends HTMLPurifier_Harness
         return $generator->generateFromTokens($tokens);
     }
 
+    /**
+     * Generate tokens from a node list by flattening a dummy element that wraps it.
+     */
+    protected function generateTokens($children)
+    {
+        $dummy = new HTMLPurifier_Node_Element("dummy");
+        $dummy->children = $children;
+        return HTMLPurifier_Arborize::flatten($dummy, $this->config, $this->context); // (node, config, context)
+    }
+
 }
 
 // vim: et sw=4 sts=4
diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php
index ecde2c27..ace642d0 100644
--- a/tests/HTMLPurifier/Strategy/FixNestingTest.php
+++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php
@@ -27,15 +27,6 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness
         );
     }
 
-    public function testEscapeBlockInInline()
-    {
-        $this->config->set('Core.EscapeInvalidChildren', true);
-        $this->assertResult(
-            '<b><div>Illegal div.</div></b>',
-            '<b>&lt;div&gt;Illegal div.&lt;/div&gt;</b>'
-        );
-    }
-
     public function testRemoveNodeWithMissingRequiredElements()
     {
         $this->assertResult('<ul></ul>', '');
@@ -98,15 +89,6 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness
         );
     }
 
-    public function testChameleonEscapeInvalidBlockInInline()
-    {
-        $this->config->set('Core.EscapeInvalidChildren', true);
-        $this->assertResult( // alt config
-          '<span><ins><div>Not allowed!</div></ins></span>',
-          '<span><ins>&lt;div&gt;Not allowed!&lt;/div&gt;</ins></span>'
-        );
-    }
-
     public function testExclusionsIntegration()
     {
         // test exclusions
-- 
2.11.4.GIT