From e41af46a8b93e90991c0e317b0756404b0b775e0 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 26 Dec 2011 14:49:26 +0800 Subject: [PATCH] Fix broken table content model, easily seen in XHTML1.1 Signed-off-by: Edward Z. Yang --- NEWS | 2 + TODO | 4 ++ library/HTMLPurifier/ChildDef/Table.php | 95 +++++++++++++++++++++++++++++-- tests/HTMLPurifier/ChildDef/TableTest.php | 16 +++++- 4 files changed, 111 insertions(+), 6 deletions(-) diff --git a/NEWS b/NEWS index 242f0688..b90be793 100644 --- a/NEWS +++ b/NEWS @@ -29,6 +29,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier problems. - Fix iconv truncation bug, where non-UTF-8 target encodings see output truncated after around 8000 characters. +- Fix broken table content model for XHTML1.1 (and also earlier + versions, although the W3C validator doesn't catch those violations) 4.3.0, released 2011-03-27 # Fixed broken caching of customized raw definitions, but requires an diff --git a/TODO b/TODO index 32a778b5..820159c3 100644 --- a/TODO +++ b/TODO @@ -112,6 +112,10 @@ Neat feature related 3. Extend the tag exclusion system to specify whether or not the contents should be dropped or not (currently, there's code that could do something like this if it didn't drop the inner text too.) + ? Make AutoParagraph also support paragraph-izing double
tags, and not + just double newlines. This is kind of tough to do in the current framework, + though, and might be reasonably approximated by search replacing double
s + with newlines before running it through HTML Purifier. Maintenance related (slightly boring) # CHMOD install script for PEAR installs diff --git a/library/HTMLPurifier/ChildDef/Table.php b/library/HTMLPurifier/ChildDef/Table.php index 34f0227d..9a93421a 100644 --- a/library/HTMLPurifier/ChildDef/Table.php +++ b/library/HTMLPurifier/ChildDef/Table.php @@ -1,7 +1,33 @@ s with a . foreach ($tokens_of_children as $token) { $is_child = ($nesting == 0); @@ -51,8 +79,9 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef // okay, let's stash the tokens away // first token tells us the type of the collection switch ($collection[$tag_index]->name) { - case 'tr': case 'tbody': + $tbody_mode = true; + case 'tr': $content[] = $collection; break; case 'caption': @@ -61,13 +90,28 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef break; case 'thead': case 'tfoot': + $tbody_mode = true; + // XXX This breaks rendering properties with + // Firefox, which never floats a to + // the top. Ever. (Our scheme will float the + // first to the top.) So maybe + // s that are not first should be + // turned into ? Very tricky, indeed. + // access the appropriate variable, $thead or $tfoot $var = $collection[$tag_index]->name; if ($$var === false) { $$var = $collection; } else { - // transmutate the first and less entries into - // tbody tags, and then put into content + // Oops, there's a second one! What + // should we do? Current behavior is to + // transmutate the first and last entries into + // tbody tags, and then put into content. + // Maybe a better idea is to *attach + // it* to the existing thead or tfoot? + // We don't do this, because Firefox + // doesn't float an extra tfoot to the + // bottom like it does for the first one. $collection[$tag_index]->name = 'tbody'; $collection[count($collection)-1]->name = 'tbody'; $content[] = $collection; @@ -126,7 +170,48 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array); if ($thead !== false) $ret = array_merge($ret, $thead); if ($tfoot !== false) $ret = array_merge($ret, $tfoot); - foreach ($content as $token_array) $ret = array_merge($ret, $token_array); + + if ($tbody_mode) { + // a little tricky, since the start of the collection may be + // whitespace + $inside_tbody = false; + foreach ($content as $token_array) { + // find the starting token + foreach ($token_array as $t) { + if ($t->name === 'tr' || $t->name === 'tbody') { + break; + } + } // iterator variable carries over + if ($t->name === 'tr') { + if ($inside_tbody) { + $ret = array_merge($ret, $token_array); + } else { + $ret[] = new HTMLPurifier_Token_Start('tbody'); + $ret = array_merge($ret, $token_array); + $inside_tbody = true; + } + } elseif ($t->name === 'tbody') { + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + $inside_tbody = false; + $ret = array_merge($ret, $token_array); + } else { + $ret = array_merge($ret, $token_array); + } + } else { + trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR); + } + } + if ($inside_tbody) { + $ret[] = new HTMLPurifier_Token_End('tbody'); + } + } else { + foreach ($content as $token_array) { + // invariant: everything in here is s + $ret = array_merge($ret, $token_array); + } + } + if (!empty($collection) && $is_collecting == false){ // grab the trailing space $ret = array_merge($ret, $collection); diff --git a/tests/HTMLPurifier/ChildDef/TableTest.php b/tests/HTMLPurifier/ChildDef/TableTest.php index 034e025b..2f72d187 100644 --- a/tests/HTMLPurifier/ChildDef/TableTest.php +++ b/tests/HTMLPurifier/ChildDef/TableTest.php @@ -28,7 +28,21 @@ class HTMLPurifier_ChildDef_TableTest extends HTMLPurifier_ChildDefHarness function testReorderContents() { $this->assertResult( '1', - '1'); + '1'); + } + + function testXhtml11Illegal() { + $this->assertResult( + 'aa', + 'aa' + ); + } + + function testTrOverflowAndClose() { + $this->assertResult( + 'abcd', + 'abcd' + ); } function testDuplicateProcessing() { -- 2.11.4.GIT