From 7e11c271b97560525407ac0ca001f95fe77f0269 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Tue, 7 Mar 2017 13:34:55 -0800 Subject: [PATCH] Revamp entity decoding to be more like HTML5. See %Core.LegacyEntityDecoder for more details. Signed-off-by: Edward Z. Yang --- NEWS | 4 + configdoc/usage.xml | 14 +-- library/HTMLPurifier/ConfigSchema/schema.ser | Bin 15692 -> 15800 bytes .../schema/Core.LegacyEntityDecoder.txt | 36 ++++++ library/HTMLPurifier/EntityParser.php | 134 ++++++++++++++++++++- library/HTMLPurifier/Lexer.php | 29 +++-- library/HTMLPurifier/Lexer/DOMLex.php | 12 +- library/HTMLPurifier/Lexer/DirectLex.php | 16 +-- library/HTMLPurifier/Lexer/PH5P.php | 5 +- tests/HTMLPurifier/LexerTest.php | 57 ++++++++- 10 files changed, 272 insertions(+), 35 deletions(-) rewrite library/HTMLPurifier/ConfigSchema/schema.ser (84%) create mode 100644 library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt diff --git a/NEWS b/NEWS index 9485d40d..beef6b20 100644 --- a/NEWS +++ b/NEWS @@ -32,6 +32,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier rest of the text in case it ran off the end. (#78) - Fix PREG_BACKTRACK_LIMIT_ERROR in HTMLPurifier_Filter_ExtractStyle. Thanks @breathbath for contributing the report and fix (#120) +- Fix entity decoding algorithm to be more conservative about + decoding entities that are missing trailing semicolon. + To get old behavior, set %Core.LegacyEntityDecoder to true. + (#119) # By default, when a link has a target attribute associated with it, we now also add rel="noopener" in order to prevent the new window from being able to overwrite diff --git a/configdoc/usage.xml b/configdoc/usage.xml index 5028b0eb..49bddaa5 100644 --- a/configdoc/usage.xml +++ b/configdoc/usage.xml @@ -6,7 +6,7 @@ 85 - 315 + 322 67 @@ -124,7 +124,7 @@ 122 - 297 + 304 @@ -172,7 +172,7 @@ 234 - 302 + 309 37 @@ -262,12 +262,12 @@ - 313 + 320 - 334 + 343 @@ -444,12 +444,12 @@ - 122 + 125 - 327 + 330 diff --git a/library/HTMLPurifier/ConfigSchema/schema.ser b/library/HTMLPurifier/ConfigSchema/schema.ser dissimilarity index 84% index 40b870b38b1357d5b4243b2843ca1d3630ffd171..df8c5c466d9298cbb8fc0bce8676452324736ef4 100644 GIT binary patch delta 156 zcwReGwWE51IitzuT;VmGibf_@O3wL3sd_%C>50jeu6ZSyC6z9z$@wX%MM~C5RtB3N pa&~i)qS~2ra-^Ceqxs}qVcE?q)OZQ$%Cs_{{E$<8^GW??UI2=nHI)DW delta 50 ycwXC4eWq%HIivCBiNb3*Hz&$Ea8C9R_CW9pIT_6+PZXBj{6USEbF-qsbY1|~>=7;i diff --git a/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt b/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt new file mode 100644 index 00000000..392b4364 --- /dev/null +++ b/library/HTMLPurifier/ConfigSchema/schema/Core.LegacyEntityDecoder.txt @@ -0,0 +1,36 @@ +Core.LegacyEntityDecoder +TYPE: bool +VERSION: 4.9.0 +DEFAULT: false +--DESCRIPTION-- +

+ Prior to HTML Purifier 4.9.0, entities were decoded by performing + a global search replace for all entities whose decoded versions + did not have special meanings under HTML, and replaced them with + their decoded versions. We would match all entities, even if they did + not have a trailing semicolon, but only if there weren't any trailing + alphanumeric characters. +

+ + + + + + +

Original	Text	Attribute
¥	¥	¥
&yen	¥	¥
&yena	&yena	&yena
&yen=	¥=	¥=

+ In HTML Purifier 4.9.0, we changed the behavior of entity parsing + to match entities that had missing trailing semicolons in less + cases, to more closely match HTML5 parsing behavior: +

+ + + + + + +

Original	Text	Attribute
¥	¥	¥
&yen	¥	¥
&yena	¥a	&yena
&yen=	¥=	&yen=

+ This flag reverts back to pre-HTML Purifier 4.9.0 behavior. +

+--# vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/EntityParser.php b/library/HTMLPurifier/EntityParser.php index 61529dcd..50e07bb2 100644 --- a/library/HTMLPurifier/EntityParser.php +++ b/library/HTMLPurifier/EntityParser.php @@ -17,6 +17,138 @@ class HTMLPurifier_EntityParser protected $_entity_lookup; /** + * Callback regex string for entities in text. + * @type string + */ + protected $_textEntitiesRegex; + + /** + * Callback regex string for entities in attributes. + * @type string + */ + protected $_attrEntitiesRegex; + + /** + * Tests if the beginning of a string is a semi-optional regex + */ + protected $_semiOptionalPrefixRegex; + + public function __construct() { + // From + // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon + $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml"; + + // NB: three empty captures to put the fourth match in the right + // place + $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/"; + + $this->_textEntitiesRegex = + '/&(?:'. + // hex + '[#]x([a-fA-F0-9]+);?|'. + // dec + '[#]0*(\d+);?|'. + // string (mandatory semicolon) + // NB: order matters: match semicolon preferentially + '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'. + // string (optional semicolon) + "($semi_optional)". + ')/'; + + $this->_attrEntitiesRegex = + '/&(?:'. + // hex + '[#]x([a-fA-F0-9]+);?|'. + // dec + '[#]0*(\d+);?|'. + // string (mandatory semicolon) + // NB: order matters: match semicolon preferentially + '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'. + // string (optional semicolon) + // don't match if trailing is equals or alphanumeric (URL + // like) + "($semi_optional)(?![=;A-Za-z0-9])". + ')/'; + + } + + /** + * Substitute entities with the parsed equivalents. Use this on + * textual data in an HTML document (as opposed to attributes.) + * + * @param string $string String to have entities parsed. + * @return string Parsed string. + */ + public function substituteTextEntities($string) + { + return preg_replace_callback( + $this->_textEntitiesRegex, + array($this, 'entityCallback'), + $string + ); + } + + /** + * Substitute entities with the parsed equivalents. Use this on + * attribute contents in documents. + * + * @param string $string String to have entities parsed. + * @return string Parsed string. + */ + public function substituteAttrEntities($string) + { + return preg_replace_callback( + $this->_attrEntitiesRegex, + array($this, 'entityCallback'), + $string + ); + } + + /** + * Callback function for substituteNonSpecialEntities() that does the work. + * + * @param array $matches PCRE matches array, with 0 the entire match, and + * either index 1, 2 or 3 set with a hex value, dec value, + * or string (respectively). + * @return string Replacement string. + */ + + protected function entityCallback($matches) + { + $entity = $matches[0]; + $hex_part = @$matches[1]; + $dec_part = @$matches[2]; + $named_part = empty($matches[3]) ? @$matches[4] : $matches[3]; + if ($hex_part) { + return HTMLPurifier_Encoder::unichr(hexdec($hex_part)); + } elseif ($dec_part) { + return HTMLPurifier_Encoder((int) $dec_part); + } else { + if (!$this->_entity_lookup) { + $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); + } + if (isset($this->_entity_lookup->table[$named_part])) { + return $this->_entity_lookup->table[$named_part]; + } else { + // exact match didn't match anything, so test if + // any of the semicolon optional match the prefix. + // Test that this is an EXACT match is important to + // prevent infinite loop + if (!empty($matches[3])) { + return preg_replace_callback( + $this->_semiOptionalPrefixRegex, + array($this, 'entityCallback'), + $entity + ); + } + return $entity; + } + } + } + + // LEGACY CODE BELOW + + /** * Callback regex string for parsing entities. * @type string */ @@ -144,7 +276,7 @@ class HTMLPurifier_EntityParser $entity; } else { return isset($this->_special_ent2dec[$matches[3]]) ? - $this->_special_ent2dec[$matches[3]] : + $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] : $entity; } } diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 44c5c659..37174eae 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -169,21 +169,24 @@ class HTMLPurifier_Lexer ''' => "'" ); + public function parseText($string, $config) { + return $this->parseData($string, false, $config); + } + + public function parseAttr($string, $config) { + return $this->parseData($string, true, $config); + } + /** * Parses special entities into the proper characters. * * This string will translate escaped versions of the special characters * into the correct ones. * - * @warning - * You should be able to treat the output of this function as - * completely parsed, but that's only because all other entities should - * have been handled previously in substituteNonSpecialEntities() - * * @param string $string String character data to be parsed. * @return string Parsed character data. */ - public function parseData($string) + public function parseData($string, $is_attr, $config) { // following functions require at least one character if ($string === '') { @@ -209,7 +212,15 @@ class HTMLPurifier_Lexer } // hmm... now we have some uncommon entities. Use the callback. - $string = $this->_entity_parser->substituteSpecialEntities($string); + if ($config->get('Core.LegacyEntityDecoder')) { + $string = $this->_entity_parser->substituteSpecialEntities($string); + } else { + if ($is_attr) { + $string = $this->_entity_parser->substituteAttrEntities($string); + } else { + $string = $this->_entity_parser->substituteTextEntities($string); + } + } return $string; } @@ -323,7 +334,9 @@ class HTMLPurifier_Lexer } // expand entities that aren't the big five - $html = $this->_entity_parser->substituteNonSpecialEntities($html); + if ($config->get('Core.LegacyEntityDecoder')) { + $html = $this->_entity_parser->substituteNonSpecialEntities($html); + } // clean into wellformed UTF-8 string for an SGML context: this has // to be done after entity expansion because the entities sometimes diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 1406c506..22ab5820 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer $div = $body->getElementsByTagName('div')->item(0); //

$tokens = array(); - $this->tokenizeDOM($div, $tokens); + $this->tokenizeDOM($div, $tokens, $config); // If the div has a sibling, that means we tripped across // a premature

tag. So remove the div we parsed, // and then tokenize the rest of body. We can't tokenize // the sibling directly as we'll lose the tags in that case. if ($div->nextSibling) { $body->removeChild($div); - $this->tokenizeDOM($body, $tokens); + $this->tokenizeDOM($body, $tokens, $config); } return $tokens; } @@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens. * @return HTMLPurifier_Token of node appended to previously passed tokens. */ - protected function tokenizeDOM($node, &$tokens) + protected function tokenizeDOM($node, &$tokens, $config) { $level = 0; $nodes = array($level => new HTMLPurifier_Queue(array($node))); @@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer while (!$nodes[$level]->isEmpty()) { $node = $nodes[$level]->shift(); // FIFO $collect = $level > 0 ? true : false; - $needEndingTag = $this->createStartNode($node, $tokens, $collect); + $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config); if ($needEndingTag) { $closingNodes[$level][] = $node; } @@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer * @return bool if the token needs an endtoken * @todo data and tagName properties don't seem to exist in DOMNode? */ - protected function createStartNode($node, &$tokens, $collect) + protected function createStartNode($node, &$tokens, $collect, $config) { // intercept non element nodes. WE MUST catch all of them, // but we're not getting the character reference nodes because @@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } } } - $tokens[] = $this->factory->createText($this->parseData($data)); + $tokens[] = $this->factory->createText($this->parseText($data, $config)); return false; } elseif ($node->nodeType === XML_COMMENT_NODE) { // this is code is only invoked for comments in script/style in versions diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 746b6e31..6f130896 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // We are not inside tag and there still is another tag to parse $token = new HTMLPurifier_Token_Text( - $this->parseData( + $this->parseText( substr( $html, $cursor, $position_next_lt - $cursor - ) + ), $config ) ); if ($maintain_line_numbers) { @@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // Create Text of rest of string $token = new HTMLPurifier_Token_Text( - $this->parseData( + $this->parseText( substr( $html, $cursor - ) + ), $config ) ); if ($maintain_line_numbers) { @@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $token = new HTMLPurifier_Token_Text( '<' . - $this->parseData( - substr($html, $cursor) + $this->parseText( + substr($html, $cursor), $config ) ); if ($maintain_line_numbers) { @@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($value === false) { $value = ''; } - return array($key => $this->parseData($value)); + return array($key => $this->parseAttr($value, $config)); } // setup loop environment @@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($value === false) { $value = ''; } - $array[$key] = $this->parseData($value); + $array[$key] = $this->parseAttr($value, $config); $cursor++; } else { // boolattr diff --git a/library/HTMLPurifier/Lexer/PH5P.php b/library/HTMLPurifier/Lexer/PH5P.php index 39a677da..0b452d17 100644 --- a/library/HTMLPurifier/Lexer/PH5P.php +++ b/library/HTMLPurifier/Lexer/PH5P.php @@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex $doc->getElementsByTagName('html')->item(0)-> // getElementsByTagName('body')->item(0) // , - $tokens + $tokens, $config ); return $tokens; } @@ -1515,6 +1515,7 @@ class HTML5 // Consume the maximum number of characters possible, with the // consumed characters case-sensitively matching one of the // identifiers in the first column of the entities table. + $e_name = $this->characters('0-9A-Za-z;', $this->char + 1); $len = strlen($e_name); @@ -1547,7 +1548,7 @@ class HTML5 // Return a character token for the character corresponding to the // entity name (as given by the second column of the entities table). - return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8'); + return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8'); } private function emitToken($token) diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index e28dc9e9..12b46dd4 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -46,11 +46,11 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness // HTMLPurifier_Lexer->parseData() ----------------------------------------- - public function assertParseData($input, $expect = true) + public function assertParseData($input, $expect = true, $is_attr = false) { if ($expect === true) $expect = $input; $lexer = new HTMLPurifier_Lexer(); - $this->assertIdentical($expect, $lexer->parseData($input)); + $this->assertIdentical($expect, $lexer->parseData($input, $is_attr, $this->config)); } public function test_parseData_plainText() @@ -95,7 +95,58 @@ class HTMLPurifier_LexerTest extends HTMLPurifier_Harness public function test_parseData_improperEntityFaultToleranceTest() { - $this->assertParseData('-'); + $this->assertParseData('-', '-'); + } + + public function test_parseData_noTrailingSemi() + { + $this->assertParseData('&A', '&A'); + } + + public function test_parseData_noTrailingSemiAttr() + { + $this->assertParseData('&A', '&A', true); + } + + public function test_parseData_T119() + { + $this->assertParseData('&A', '&A', true); + } + + public function test_parseData_T119b() + { + $this->assertParseData('&trade=', true, true); + } + + public function test_parseData_legacy1() + { + $this->config->set('Core.LegacyEntityDecoder', true); + $this->assertParseData('&a', true); + $this->assertParseData('&=', "&="); + $this->assertParseData('&a', true, true); + $this->assertParseData('&=', "&=", true); + $this->assertParseData('<a', true); + $this->assertParseData('<=', "<="); + $this->assertParseData('<a', true, true); + $this->assertParseData('<=', "<=", true); + } + + public function test_parseData_nonlegacy1() + { + $this->assertParseData('&a', "&a"); + $this->assertParseData('&=', "&="); + $this->assertParseData('&a', true, true); + $this->assertParseData('&=', true, true); + $this->assertParseData('<a', "assertParseData('<=', "<="); + $this->assertParseData('<a', true, true); + $this->assertParseData('<=', true, true); + $this->assertParseData('<a;', "assertParseData('&imath'); } // HTMLPurifier_Lexer->extractBody() --------------------------------------- -- 2.11.4.GIT