From cb5d5d0648a93e2a8043a10aa790cef2c6d87ad2 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 14 May 2008 02:19:00 +0000 Subject: [PATCH] [3.1.0] Revamp URI handling of percent encoding and validation. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1709 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 2 + TODO | 3 ++ configdoc/usage.xml | 4 +- library/HTMLPurifier/AttrDef/URI.php | 12 +---- library/HTMLPurifier/AttrDef/URI/Host.php | 20 +++++++-- library/HTMLPurifier/PercentEncoder.php | 65 ++++++++++++++++++++++++--- library/HTMLPurifier/URI.php | 70 +++++++++++++++++++++++++---- library/HTMLPurifier/URIParser.php | 41 ++++++++++------- tests/HTMLPurifier/AttrDef/URI/HostTest.php | 21 +++++++++ tests/HTMLPurifier/AttrDef/URITest.php | 13 ++++++ tests/HTMLPurifier/PercentEncoderTest.php | 23 ++++++++++ tests/HTMLPurifier/URIParserTest.php | 9 +++- tests/HTMLPurifier/URITest.php | 28 ++++++++++++ 13 files changed, 261 insertions(+), 50 deletions(-) diff --git a/NEWS b/NEWS index 8f9d14cc..39bdadcd 100644 --- a/NEWS +++ b/NEWS @@ -32,6 +32,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ! Commas, not dashes, used for serializer IDs. This change is forwards-compatible and allows for version numbers like "3.1.0-dev". ! %HTML.Allowed deals gracefully with whitespace anywhere, anytime! +! HTML Purifier's URI handling is a lot more robust, with much stricter + validation checks and better percent encoding handling. - InterchangeBuilder now alphabetizes its lists - Validation error in configdoc output fixed - Iconv and other encoding errors muted even with custom error handlers that diff --git a/TODO b/TODO index eca4768c..e19a2ed5 100644 --- a/TODO +++ b/TODO @@ -11,6 +11,8 @@ If no interest is expressed for a feature that may require a considerable amount of effort to implement, it may get endlessly delayed. Do not be afraid to cast your vote for the next feature to be implemented! +- Implement validation for query and for fragment + FUTURE VERSIONS --------------- @@ -47,6 +49,7 @@ FUTURE VERSIONS AttrDef class). Probably will use CSSTidy class? # More control over allowed CSS properties using a modularization # HTML 5 support + # IRI support - Standardize token armor for all areas of processing - Convert RTL/LTR override characters to tags, or vice versa on demand. Also, enable disabling of directionality diff --git a/configdoc/usage.xml b/configdoc/usage.xml index 0d7346b3..0929548d 100644 --- a/configdoc/usage.xml +++ b/configdoc/usage.xml @@ -215,12 +215,12 @@ - 24 + 23 - 78 + 68 diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index 5afc1c1b..9c632f1a 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -7,7 +7,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef { - protected $parser, $percentEncoder; + protected $parser; protected $embedsResource; /** @@ -15,7 +15,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef */ public function __construct($embeds_resource = false) { $this->parser = new HTMLPurifier_URIParser(); - $this->percentEncoder = new HTMLPurifier_PercentEncoder(); $this->embedsResource = (bool) $embeds_resource; } @@ -23,9 +22,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef if ($config->get('URI', 'Disable')) return false; - // initial operations $uri = $this->parseCDATA($uri); - $uri = $this->percentEncoder->normalize($uri); // parse the URI $uri = $this->parser->parse($uri); @@ -61,13 +58,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef $context->destroy('EmbeddedURI'); if (!$ok) return false; - // munge scheme off if necessary (this must be last) - if (!is_null($uri->scheme) && is_null($uri->host)) { - if ($uri_def->defaultScheme == $uri->scheme) { - $uri->scheme = null; - } - } - // back to string $result = $uri->toString(); diff --git a/library/HTMLPurifier/AttrDef/URI/Host.php b/library/HTMLPurifier/AttrDef/URI/Host.php index 51c1b8a5..8d1a7b2c 100644 --- a/library/HTMLPurifier/AttrDef/URI/Host.php +++ b/library/HTMLPurifier/AttrDef/URI/Host.php @@ -36,11 +36,23 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef $ipv4 = $this->ipv4->validate($string, $config, $context); if ($ipv4 !== false) return $ipv4; - // validate a domain name here, do filtering, etc etc etc + // A regular domain name. - // We could use this, but it would break I18N domain names - //$match = preg_match('/^[a-z0-9][\w\-\.]*[a-z0-9]$/i', $string); - //if (!$match) return false; + // This breaks I18N domain names, but we don't have proper IRI support, + // so force users to insert Punycode. If there's complaining we'll + // try to fix things into an international friendly form. + + // The productions describing this are: + $a = '[a-z]'; // alpha + $an = '[a-z0-9]'; // alphanum + $and = '[a-z0-9-]'; // alphanum | "-" + // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum + $domainlabel = "$an($and*$an)?"; + // toplabel = alpha | alpha *( alphanum | "-" ) alphanum + $toplabel = "$a($and*$an)?"; + // hostname = *( domainlabel "." ) toplabel [ "." ] + $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string); + if (!$match) return false; return $string; } diff --git a/library/HTMLPurifier/PercentEncoder.php b/library/HTMLPurifier/PercentEncoder.php index 9ce3a1f1..e1358675 100644 --- a/library/HTMLPurifier/PercentEncoder.php +++ b/library/HTMLPurifier/PercentEncoder.php @@ -2,12 +2,68 @@ /** * Class that handles operations involving percent-encoding in URIs. + * + * @warning + * Be careful when reusing instances of PercentEncoder. The object + * you use for normalize() SHOULD NOT be used for encode(), or + * vice-versa. */ class HTMLPurifier_PercentEncoder { /** - * Fix up percent-encoding by decoding unreserved characters and normalizing + * Reserved characters to preserve when using encode(). + */ + protected $preserve = array(); + + /** + * String of characters that should be preserved while using encode(). + */ + public function __construct($preserve = false) { + // unreserved letters, ought to const-ify + for ($i = 48; $i <= 57; $i++) $this->preserve[$i] = true; // digits + for ($i = 65; $i <= 90; $i++) $this->preserve[$i] = true; // upper-case + for ($i = 97; $i <= 122; $i++) $this->preserve[$i] = true; // lower-case + $this->preserve[45] = true; // Dash - + $this->preserve[46] = true; // Period . + $this->preserve[95] = true; // Underscore _ + $this->preserve[126]= true; // Tilde ~ + + // extra letters not to escape + if ($preserve !== false) { + for ($i = 0, $c = strlen($preserve); $i < $c; $i++) { + $this->preserve[ord($preserve[$i])] = true; + } + } + } + + /** + * Our replacement for urlencode, it encodes all non-reserved characters, + * as well as any extra characters that were instructed to be preserved. + * @note + * Assumes that the string has already been normalized, making any + * and all percent escape sequences valid. Percents will not be + * re-escaped, regardless of their status in $preserve + * @param $string String to be encoded + * @return Encoded string. + */ + public function encode($string) { + $ret = ''; + for ($i = 0, $c = strlen($string); $i < $c; $i++) { + if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])]) ) { + $ret .= '%' . sprintf('%02X', $int); + } else { + $ret .= $string[$i]; + } + } + return $ret; + } + + /** + * Fix up percent-encoding by decoding unreserved characters and normalizing. + * @warning This function is affected by $preserve, even though the + * usual desired behavior is for this not to preserve those + * characters. Be careful when reusing instances of PercentEncoder! * @param $string String to normalize */ public function normalize($string) { @@ -27,12 +83,7 @@ class HTMLPurifier_PercentEncoder continue; } $int = hexdec($encoding); - if ( - ($int >= 48 && $int <= 57) || // digits - ($int >= 65 && $int <= 90) || // uppercase letters - ($int >= 97 && $int <= 122) || // lowercase letters - $int == 126 || $int == 45 || $int == 46 || $int == 95 // ~-._ - ) { + if (isset($this->preserve[$int])) { $ret .= chr($int) . $text; continue; } diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php index d63b324e..43f1b192 100644 --- a/library/HTMLPurifier/URI.php +++ b/library/HTMLPurifier/URI.php @@ -1,7 +1,12 @@ scheme) && is_null($this->host)) { + $def = $config->getDefinition('URI'); + if ($def->defaultScheme === $this->scheme) { + $this->scheme = null; + } + } + // validate host if (!is_null($this->host)) { $host_def = new HTMLPurifier_AttrDef_URI_Host(); @@ -63,18 +82,51 @@ class HTMLPurifier_URI if ($this->host === false) $this->host = null; } + // validate username + if (!is_null($this->userinfo)) { + $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':'); + $this->userinfo = $encoder->encode($this->userinfo); + } + // validate port if (!is_null($this->port)) { if ($this->port < 1 || $this->port > 65535) $this->port = null; } - // query and fragment are quite simple in terms of definition: - // *( pchar / "/" / "?" ), so define their validation routines - // when we start fixing percent encoding - - // path gets to be validated against a hodge-podge of rules depending - // on the status of authority and scheme, but it's not that important, - // esp. since it won't be applicable to everyone + // validate path + $path_parts = array(); + $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/'); + if (!is_null($this->host)) { + // path-abempty (hier and relative) + $this->path = $segments_encoder->encode($this->path); + } elseif ($this->path !== '' && $this->path[0] === '/') { + // path-absolute (hier and relative) + if (strlen($this->path) >= 2 && $this->path[1] === '/') { + // This shouldn't ever happen! + $this->path = ''; + } else { + $this->path = $segments_encoder->encode($this->path); + } + } elseif (!is_null($this->scheme) && $this->path !== '') { + // path-rootless (hier) + // Short circuit evaluation means we don't need to check nz + $this->path = $segments_encoder->encode($this->path); + } elseif (is_null($this->scheme) && $this->path !== '') { + // path-noscheme (relative) + // (once again, not checking nz) + $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@'); + $c = strpos($this->path, '/'); + if ($c !== false) { + $this->path = + $segment_nc_encoder->encode(substr($this->path, 0, $c)) . + $segments_encoder->encode(substr($this->path, $c)); + } else { + $this->path = $segment_nc_encoder->encode($this->path); + } + } else { + // path-empty (hier and relative) + $this->path = ''; // just to be safe + } return true; diff --git a/library/HTMLPurifier/URIParser.php b/library/HTMLPurifier/URIParser.php index a52e5042..0ea51762 100644 --- a/library/HTMLPurifier/URIParser.php +++ b/library/HTMLPurifier/URIParser.php @@ -2,24 +2,39 @@ /** * Parses a URI into the components and fragment identifier as specified - * by RFC 2396. - * @todo Replace regexps with a native PHP parser + * by RFC 3986. */ class HTMLPurifier_URIParser { /** - * Parses a URI + * Instance of HTMLPurifier_PercentEncoder to do normalization with. + */ + protected $percentEncoder; + + public function __construct() { + $this->percentEncoder = new HTMLPurifier_PercentEncoder(); + } + + /** + * Parses a URI. * @param $uri string URI to parse - * @return HTMLPurifier_URI representation of URI + * @return HTMLPurifier_URI representation of URI. This representation has + * not been validated yet and may not conform to RFC. */ public function parse($uri) { + + $uri = $this->percentEncoder->normalize($uri); + + // Regexp is as per Appendix B. + // Note that ["<>] are an addition to the RFC's recommended + // characters, because they represent external delimeters. $r_URI = '!'. - '(([^:/?#<>\'"]+):)?'. // 2. Scheme - '(//([^/?#<>\'"]*))?'. // 4. Authority - '([^?#<>\'"]*)'. // 5. Path - '(\?([^#<>\'"]*))?'. // 7. Query - '(#([^<>\'"]*))?'. // 8. Fragment + '(([^:/?#"<>]+):)?'. // 2. Scheme + '(//([^/?#"<>]*))?'. // 4. Authority + '([^?#"<>]*)'. // 5. Path + '(\?([^#"<>]*))?'. // 7. Query + '(#([^"<>]*))?'. // 8. Fragment '!'; $matches = array(); @@ -36,13 +51,7 @@ class HTMLPurifier_URIParser // further parse authority if ($authority !== null) { - // ridiculously inefficient: it's a stacked regex! - $HEXDIG = '[A-Fa-f0-9]'; - $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with [] - $sub_delims = '!$&\'()'; // needs [] - $pct_encoded = "%$HEXDIG$HEXDIG"; - $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*"; - $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/"; + $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/"; $matches = array(); preg_match($r_authority, $authority, $matches); $userinfo = !empty($matches[1]) ? $matches[2] : null; diff --git a/tests/HTMLPurifier/AttrDef/URI/HostTest.php b/tests/HTMLPurifier/AttrDef/URI/HostTest.php index b05276da..4106685d 100644 --- a/tests/HTMLPurifier/AttrDef/URI/HostTest.php +++ b/tests/HTMLPurifier/AttrDef/URI/HostTest.php @@ -14,6 +14,27 @@ class HTMLPurifier_AttrDef_URI_HostTest extends HTMLPurifier_AttrDefHarness $this->assertDef('124.15.6.89'); // IPv4 $this->assertDef('www.google.com'); // reg-name + // more domain name tests + $this->assertDef('test.'); + $this->assertDef('sub.test.'); + $this->assertDef('.test', false); + $this->assertDef('ff'); + $this->assertDef('1f', false); + $this->assertDef('-f', false); + $this->assertDef('f1'); + $this->assertDef('f-', false); + $this->assertDef('sub.ff'); + $this->assertDef('sub.1f', false); + $this->assertDef('sub.-f', false); + $this->assertDef('sub.f1'); + $this->assertDef('sub.f-', false); + $this->assertDef('ff.top'); + $this->assertDef('1f.top'); + $this->assertDef('-f.top', false); + $this->assertDef('ff.top'); + $this->assertDef('f1.top'); + $this->assertDef('f-.top', false); + } } diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php index 026d5d49..a22adaab 100644 --- a/tests/HTMLPurifier/AttrDef/URITest.php +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -29,6 +29,19 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness ); } + function testPercentEncoding() { + $this->assertDef( + 'http:colon:mercenary', + 'colon%3Amercenary' + ); + } + + function testPercentEncodingPreserve() { + $this->assertDef( + 'http://www.example.com/abcABC123-_.!~*()\'' + ); + } + function testEmbeds() { $this->def = new HTMLPurifier_AttrDef_URI(true); $this->assertDef('http://sub.example.com/alas?foo=asd'); diff --git a/tests/HTMLPurifier/PercentEncoderTest.php b/tests/HTMLPurifier/PercentEncoderTest.php index b77875fc..6df8eb7a 100644 --- a/tests/HTMLPurifier/PercentEncoderTest.php +++ b/tests/HTMLPurifier/PercentEncoderTest.php @@ -35,5 +35,28 @@ class HTMLPurifier_PercentEncoderTest extends HTMLPurifier_Harness } + function assertEncode($string, $expect = true, $preserve = false) { + if ($expect === true) $expect = $string; + $encoder = new HTMLPurifier_PercentEncoder($preserve); + $result = $encoder->encode($string); + $this->assertIdentical($result, $expect); + } + + function test_encode_noChange() { + $this->assertEncode('abc012-_~.'); + } + + function test_encode_encode() { + $this->assertEncode('>', '%3E'); + } + + function test_encode_preserve() { + $this->assertEncode('<>', '<%3E', '<'); + } + + function test_encode_low() { + $this->assertEncode("\1", '%01'); + } + } diff --git a/tests/HTMLPurifier/URIParserTest.php b/tests/HTMLPurifier/URIParserTest.php index 97c6a2d4..ed3aa499 100644 --- a/tests/HTMLPurifier/URIParserTest.php +++ b/tests/HTMLPurifier/URIParserTest.php @@ -13,6 +13,13 @@ class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness $this->assertEqual($result, $expect); } + function testPercentNormalization() { + $this->assertParsing( + '%G', + null, null, null, null, '%25G', null, null + ); + } + function testRegular() { $this->assertParsing( 'http://www.example.com/webhp?q=foo#result2', @@ -121,7 +128,7 @@ class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness function testMalformedTag() { $this->assertParsing( - 'http://www.example.com/\'>"', + 'http://www.example.com/>', 'http', null, 'www.example.com', null, '/', null, null ); } diff --git a/tests/HTMLPurifier/URITest.php b/tests/HTMLPurifier/URITest.php index 0d916045..97d77927 100644 --- a/tests/HTMLPurifier/URITest.php +++ b/tests/HTMLPurifier/URITest.php @@ -160,4 +160,32 @@ class HTMLPurifier_URITest extends HTMLPurifier_URIHarness $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:'); } + function test_validate_removeRedundantScheme() { + $this->assertValidation('http:foo:/:', 'foo%3A/:'); + } + + function test_validate_username() { + $this->assertValidation("http://user\xE3\x91\x94:@foo.com", 'http://user%E3%91%94:@foo.com'); + } + + function test_validate_path_abempty() { + $this->assertValidation("http://host/\xE3\x91\x94:", 'http://host/%E3%91%94:'); + } + + function test_validate_path_absolute() { + $this->assertValidation("/\xE3\x91\x94:", '/%E3%91%94:'); + } + + function test_validate_path_rootless() { + $this->assertValidation("mailto:\xE3\x91\x94:", 'mailto:%E3%91%94:'); + } + + function test_validate_path_noscheme() { + $this->assertValidation("\xE3\x91\x94", '%E3%91%94'); + } + + function test_validate_path_empty() { + $this->assertValidation('http://google.com'); + } + } -- 2.11.4.GIT