From 9f1e678b482793c91c25b794ffb8fe5c9756bd93 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 5 Apr 2008 04:28:37 +0000 Subject: [PATCH] [3.1.0] Fixed fatal error in PH5P lexer with invalid tag names git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1650 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 1 + library/HTMLPurifier/Lexer/DirectLex.php | 18 ++----------- library/HTMLPurifier/Lexer/PH5P.php | 17 +++++++++--- maintenance/PH5P.patch | 46 +++++++++++++++++++++++++++++--- tests/HTMLPurifier/LexerTest.php | 22 ++++++++++++++- tests/common.php | 13 +++++++++ 6 files changed, 93 insertions(+), 24 deletions(-) diff --git a/NEWS b/NEWS index c10a1897..e35599d1 100644 --- a/NEWS +++ b/NEWS @@ -55,6 +55,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier - Fix bug with rgb(0, 1, 2) color syntax with spaces inside shorthand syntax - HTMLPurifier_HTMLDefinition->addAttribute can now be called multiple times on the same element without emitting errors. +- Fixed fatal error in PH5P lexer with invalid tag names . Plugins now get their own changelogs according to project conventions. . Convert tokens to use instanceof, reducing memory footprint and improving comparison speed. diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 1d38826b..985ad680 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -63,16 +63,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $e =& $context->get('ErrorCollector'); } - // infinite loop protection - // has to be pretty big, since html docs can be big - // we're allow two hundred thousand tags... more than enough? 
- // NOTE: this is also used for synchronization, so watch out + // for testing synchronization $loops = 0; - while(true) { - - // infinite loop protection - if (++$loops > 200000) return array(); + while(++$loops) { // recalculate lines if ( @@ -381,16 +375,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // space, so let's guarantee that there's always a terminating space. $string .= ' '; - // infinite loop protection - $loops = 0; while(true) { - // infinite loop protection - if (++$loops > 1000) { - trigger_error('Infinite loop detected in attribute parsing', E_USER_WARNING); - return array(); - } - if ($cursor >= $size) { break; } diff --git a/library/HTMLPurifier/Lexer/PH5P.php b/library/HTMLPurifier/Lexer/PH5P.php index bd068000..9fa92448 100644 --- a/library/HTMLPurifier/Lexer/PH5P.php +++ b/library/HTMLPurifier/Lexer/PH5P.php @@ -115,7 +115,7 @@ class HTML5 { public function __construct($data) { $data = str_replace("\r\n", "\n", $data); - $date = str_replace("\r", null, $data); + $data = str_replace("\r", null, $data); $this->data = $data; $this->char = -1; @@ -2143,7 +2143,7 @@ class HTML5TreeConstructer { /* Reconstruct the active formatting elements, if any. 
*/ $this->reconstructActiveFormattingElements(); - $this->insertElement($token); + $this->insertElement($token, true, true); break; } break; @@ -3524,7 +3524,18 @@ class HTML5TreeConstructer { } } - private function insertElement($token, $append = true) { + private function insertElement($token, $append = true, $check = false) { + // Proprietary workaround for libxml2's limitations with tag names + if ($check) { + // Slightly modified HTML5 tag-name modification, + // removing anything that's not an ASCII letter, digit, or hyphen + $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']); + // Remove leading hyphens and numbers + $token['name'] = ltrim($token['name'], '-0..9'); + // In theory, this should never be needed, but just in case + if ($token['name'] === '') $token['name'] = 'span'; // arbitrary generic choice + } + $el = $this->dom->createElement($token['name']); foreach($token['attr'] as $attr) { diff --git a/maintenance/PH5P.patch b/maintenance/PH5P.patch index 9365cffe..bd9cd491 100644 --- a/maintenance/PH5P.patch +++ b/maintenance/PH5P.patch @@ -1,5 +1,14 @@ ---- C:\Users\Edward\Webs\htmlpurifier\maintenance\PH5P.php 2007-11-04 23:41:49.074543700 -0500 -+++ C:\Users\Edward\Webs\htmlpurifier\maintenance/PH5P.new.php 2007-11-05 00:23:52.839543700 -0500 +--- C:\Users\Edward\Webs\htmlpurifier\maintenance\PH5P.php 2007-11-05 00:01:51.643585000 -0500 ++++ C:\Users\Edward\Webs\htmlpurifier\maintenance/PH5P.new.php 2008-04-05 00:26:39.343160000 -0400 +@@ -65,7 +65,7 @@ + + public function __construct($data) { + $data = str_replace("\r\n", "\n", $data); +- $date = str_replace("\r", null, $data); ++ $data = str_replace("\r", null, $data); + + $this->data = $data; + $this->char = -1; @@ -211,7 +211,10 @@ // If nothing is returned, emit a U+0026 AMPERSAND character token. // Otherwise, emit the character token that was returned. 
@@ -43,7 +52,36 @@ $entity = $id; break; } -@@ -3659,7 +3668,7 @@ +@@ -2084,7 +2093,7 @@ + /* Reconstruct the active formatting elements, if any. */ + $this->reconstructActiveFormattingElements(); + +- $this->insertElement($token); ++ $this->insertElement($token, true, true); + break; + } + break; +@@ -3465,7 +3474,18 @@ + } + } + +- private function insertElement($token, $append = true) { ++ private function insertElement($token, $append = true, $check = false) { ++ // Proprietary workaround for libxml2's limitations with tag names ++ if ($check) { ++ // Slightly modified HTML5 tag-name modification, ++ // removing anything that's not an ASCII letter, digit, or hyphen ++ $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']); ++ // Remove leading hyphens and numbers ++ $token['name'] = ltrim($token['name'], '-0..9'); ++ // In theory, this should never be needed, but just in case ++ if ($token['name'] === '') $token['name'] = 'span'; // arbitrary generic choice ++ } ++ + $el = $this->dom->createElement($token['name']); + + foreach($token['attr'] as $attr) { +@@ -3659,7 +3679,7 @@ } } @@ -52,7 +90,7 @@ /* When the steps below require the UA to generate implied end tags, then, if the current node is a dd element, a dt element, an li element, a p element, a td element, a th element, or a tr element, the UA must -@@ -3673,7 +3682,8 @@ +@@ -3673,7 +3693,8 @@ } } diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 78e5a056..6425ca35 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -509,7 +509,7 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness ); } - function test_tokenizeHTML_() { + function test_tokenizeHTML_style() { $extra = array( // PH5P doesn't seem to like style tags 'PH5P' => false, @@ -543,6 +543,26 @@ div {} ); } + function test_tokenizeHTML_() { + $this->assertTokenization( + '<a@>>', + array( + new HTMLPurifier_Token_Start('a'), + new HTMLPurifier_Token_Text('>'), + 
new HTMLPurifier_Token_End('a'), + ), + array( + 'DirectLex' => array( + // Technically this is invalid, but it won't be a + // problem with invalid element removal; also, this + // mimics Mozilla's parsing of the tag. + new HTMLPurifier_Token_Start('a@'), + new HTMLPurifier_Token_Text('>'), + ), + ) + ); + } + /* function test_tokenizeHTML_() { diff --git a/tests/common.php b/tests/common.php index 05192455..85f2dc70 100644 --- a/tests/common.php +++ b/tests/common.php @@ -209,3 +209,16 @@ function htmlpurifier_flush($php, $reporter) { exit(1); } } + +/** + * Dumps error queue, useful if there has been a fatal error. + */ +function htmlpurifier_dump_error_queue() { + $context = &SimpleTest::getContext(); + $queue = &$context->get('SimpleErrorQueue'); + if ($queue && !empty($queue->_queue)) { + // replace this with something prettier + var_dump($queue->_queue); + } +} +register_shutdown_function('htmlpurifier_dump_error_queue'); -- 2.11.4.GIT