From cb5d5d0648a93e2a8043a10aa790cef2c6d87ad2 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Wed, 14 May 2008 02:19:00 +0000
Subject: [PATCH] [3.1.0] Revamp URI handling of percent encoding and
 validation.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1709 48356398-32a2-884e-a903-53898d9a118a
---
 NEWS                                        |  2 +
 TODO                                        |  3 ++
 configdoc/usage.xml                         |  4 +-
 library/HTMLPurifier/AttrDef/URI.php        | 12 +----
 library/HTMLPurifier/AttrDef/URI/Host.php   | 20 +++++++--
 library/HTMLPurifier/PercentEncoder.php     | 65 ++++++++++++++++++++++++---
 library/HTMLPurifier/URI.php                | 70 +++++++++++++++++++++++++----
 library/HTMLPurifier/URIParser.php          | 41 ++++++++++-------
 tests/HTMLPurifier/AttrDef/URI/HostTest.php | 21 +++++++++
 tests/HTMLPurifier/AttrDef/URITest.php      | 13 ++++++
 tests/HTMLPurifier/PercentEncoderTest.php   | 23 ++++++++++
 tests/HTMLPurifier/URIParserTest.php        |  9 +++-
 tests/HTMLPurifier/URITest.php              | 28 ++++++++++++
 13 files changed, 261 insertions(+), 50 deletions(-)
diff --git a/NEWS b/NEWS
index 8f9d14cc..39bdadcd 100644
--- a/NEWS
+++ b/NEWS
@@ -32,6 +32,8 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 ! Commas, not dashes, used for serializer IDs. This change is forwards-compatible
   and allows for version numbers like "3.1.0-dev".
 ! %HTML.Allowed deals gracefully with whitespace anywhere, anytime!
+! HTML Purifier's URI handling is a lot more robust, with much stricter
+  validation checks and better percent encoding handling.
 - InterchangeBuilder now alphabetizes its lists
 - Validation error in configdoc output fixed
 - Iconv and other encoding errors muted even with custom error handlers that
diff --git a/TODO b/TODO
index eca4768c..e19a2ed5 100644
--- a/TODO
+++ b/TODO
@@ -11,6 +11,8 @@ If no interest is expressed for a feature that may require a considerable
 amount of effort to implement, it may get endlessly delayed. Do not be
 afraid to cast your vote for the next feature to be implemented!
 
+- Implement validation for query and for fragment
+
 FUTURE VERSIONS
 ---------------
 
@@ -47,6 +49,7 @@ FUTURE VERSIONS
    AttrDef class). Probably will use CSSTidy class?
  # More control over allowed CSS properties using a modularization
  # HTML 5 support
+ # IRI support
  - Standardize token armor for all areas of processing
  - Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
    Also, enable disabling of directionality
diff --git a/configdoc/usage.xml b/configdoc/usage.xml
index 0d7346b3..0929548d 100644
--- a/configdoc/usage.xml
+++ b/configdoc/usage.xml
@@ -215,12 +215,12 @@
  </directive>
  <directive id="URI.Disable">
   <file name="HTMLPurifier/AttrDef/URI.php">
-   <line>24</line>
+   <line>23</line>
   </file>
  </directive>
  <directive id="URI.Munge">
   <file name="HTMLPurifier/AttrDef/URI.php">
-   <line>78</line>
+   <line>68</line>
   </file>
  </directive>
  <directive id="Core.ColorKeywords">
diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php
index 5afc1c1b..9c632f1a 100644
--- a/library/HTMLPurifier/AttrDef/URI.php
+++ b/library/HTMLPurifier/AttrDef/URI.php
@@ -7,7 +7,7 @@
 class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
 {
     
-    protected $parser, $percentEncoder;
+    protected $parser;
     protected $embedsResource;
     
     /**
@@ -15,7 +15,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
      */
     public function __construct($embeds_resource = false) {
         $this->parser = new HTMLPurifier_URIParser();
-        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
         $this->embedsResource = (bool) $embeds_resource;
     }
     
@@ -23,9 +22,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
         
         if ($config->get('URI', 'Disable')) return false;
         
-        // initial operations
         $uri = $this->parseCDATA($uri);
-        $uri = $this->percentEncoder->normalize($uri);
         
         // parse the URI
         $uri = $this->parser->parse($uri);
@@ -61,13 +58,6 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
         $context->destroy('EmbeddedURI');
         if (!$ok) return false;
         
-        // munge scheme off if necessary (this must be last)
-        if (!is_null($uri->scheme) && is_null($uri->host)) {
-            if ($uri_def->defaultScheme == $uri->scheme) {
-                $uri->scheme = null;
-            }
-        }
-        
         // back to string
         $result = $uri->toString();
         
diff --git a/library/HTMLPurifier/AttrDef/URI/Host.php b/library/HTMLPurifier/AttrDef/URI/Host.php
index 51c1b8a5..8d1a7b2c 100644
--- a/library/HTMLPurifier/AttrDef/URI/Host.php
+++ b/library/HTMLPurifier/AttrDef/URI/Host.php
@@ -36,11 +36,23 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
         $ipv4 = $this->ipv4->validate($string, $config, $context);
         if ($ipv4 !== false) return $ipv4;
         
-        // validate a domain name here, do filtering, etc etc etc
+        // A regular domain name.
         
-        // We could use this, but it would break I18N domain names
-        //$match = preg_match('/^[a-z0-9][\w\-\.]*[a-z0-9]$/i', $string);
-        //if (!$match) return false;
+        // This breaks I18N domain names, but we don't have proper IRI support,
+        // so force users to insert Punycode. If there's complaining we'll 
+        // try to fix things into an international friendly form.
+        
+        // The productions describing this are:
+        $a   = '[a-z]';     // alpha
+        $an  = '[a-z0-9]';  // alphanum
+        $and = '[a-z0-9-]'; // alphanum | "-"
+        // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
+        $domainlabel   = "$an($and*$an)?";
+        // toplabel    = alpha | alpha *( alphanum | "-" ) alphanum
+        $toplabel      = "$a($and*$an)?";
+        // hostname    = *( domainlabel "." ) toplabel [ "." ]
+        $match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
+        if (!$match) return false;
         
         return $string;
     }
diff --git a/library/HTMLPurifier/PercentEncoder.php b/library/HTMLPurifier/PercentEncoder.php
index 9ce3a1f1..e1358675 100644
--- a/library/HTMLPurifier/PercentEncoder.php
+++ b/library/HTMLPurifier/PercentEncoder.php
@@ -2,12 +2,68 @@
 
 /**
  * Class that handles operations involving percent-encoding in URIs.
+ *
+ * @warning
+ *      Be careful when reusing instances of PercentEncoder. The object
+ *      you use for normalize() SHOULD NOT be used for encode(), or
+ *      vice-versa.
  */
 class HTMLPurifier_PercentEncoder
 {
     
     /**
-     * Fix up percent-encoding by decoding unreserved characters and normalizing
+     * Reserved characters to preserve when using encode().
+     */
+    protected $preserve = array();
+    
+    /**
+     * String of characters that should be preserved while using encode().
+     */
+    public function __construct($preserve = false) {
+        // unreserved letters, ought to const-ify
+        for ($i = 48; $i <= 57;  $i++) $this->preserve[$i] = true; // digits
+        for ($i = 65; $i <= 90;  $i++) $this->preserve[$i] = true; // upper-case
+        for ($i = 97; $i <= 122; $i++) $this->preserve[$i] = true; // lower-case
+        $this->preserve[45] = true; // Dash         -
+        $this->preserve[46] = true; // Period       .
+        $this->preserve[95] = true; // Underscore   _
+        $this->preserve[126]= true; // Tilde        ~
+        
+        // extra letters not to escape
+        if ($preserve !== false) {
+            for ($i = 0, $c = strlen($preserve); $i < $c; $i++) {
+                $this->preserve[ord($preserve[$i])] = true;
+            }
+        }
+    }
+    
+    /**
+     * Our replacement for urlencode, it encodes all non-reserved characters,
+     * as well as any extra characters that were instructed to be preserved.
+     * @note
+     *      Assumes that the string has already been normalized, making any
+     *      and all percent escape sequences valid. Percents will not be
+     *      re-escaped, regardless of their status in $preserve
+     * @param $string String to be encoded
+     * @return Encoded string.
+     */
+    public function encode($string) {
+        $ret = '';
+        for ($i = 0, $c = strlen($string); $i < $c; $i++) {
+            if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])]) ) {
+                $ret .= '%' . sprintf('%02X', $int);
+            } else {
+                $ret .= $string[$i];
+            }
+        }
+        return $ret;
+    }
+    
+    /**
+     * Fix up percent-encoding by decoding unreserved characters and normalizing.
+     * @warning This function is affected by $preserve, even though the
+     *          usual desired behavior is for this not to preserve those
+     *          characters. Be careful when reusing instances of PercentEncoder!
      * @param $string String to normalize
      */
     public function normalize($string) {
@@ -27,12 +83,7 @@ class HTMLPurifier_PercentEncoder
                 continue;
             }
             $int = hexdec($encoding);
-            if (
-                ($int >= 48 && $int <= 57) || // digits
-                ($int >= 65 && $int <= 90) || // uppercase letters
-                ($int >= 97 && $int <= 122) || // lowercase letters
-                $int == 126 || $int == 45 || $int == 46 || $int == 95 // ~-._
-            ) {
+            if (isset($this->preserve[$int])) {
                 $ret .= chr($int) . $text;
                 continue;
             }
diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php
index d63b324e..43f1b192 100644
--- a/library/HTMLPurifier/URI.php
+++ b/library/HTMLPurifier/URI.php
@@ -1,7 +1,12 @@
 <?php
 
 /**
- * HTML Purifier's internal representation of a URI
+ * HTML Purifier's internal representation of a URI.
+ * @note
+ *      Internal data-structures are completely escaped. If the data needs
+ *      to be used in a non-URI context (which is very unlikely), be sure
+ *      to decode it first. The URI may not necessarily be well-formed until
+ *      validate() is called.
  */
 class HTMLPurifier_URI
 {
@@ -49,13 +54,27 @@ class HTMLPurifier_URI
     }
     
     /**
-     * Generic validation method applicable for all schemes
+     * Generic validation method applicable for all schemes. May modify
+     * this URI in order to get it into a compliant form.
      * @param $config Instance of HTMLPurifier_Config
      * @param $context Instance of HTMLPurifier_Context
      * @return True if validation/filtering succeeds, false if failure
      */
     public function validate($config, $context) {
         
+        // ABNF definitions from RFC 3986
+        $chars_sub_delims = '!$&\'()*+,;=';
+        $chars_gen_delims = ':/?#[]@';
+        $chars_pchar = $chars_sub_delims . ':@';
+        
+        // validate scheme (MUST BE FIRST!)
+        if (!is_null($this->scheme) && is_null($this->host)) {
+            $def = $config->getDefinition('URI');
+            if ($def->defaultScheme === $this->scheme) {
+                $this->scheme = null;
+            }
+        }
+        
         // validate host
         if (!is_null($this->host)) {
             $host_def = new HTMLPurifier_AttrDef_URI_Host();
@@ -63,18 +82,51 @@ class HTMLPurifier_URI
             if ($this->host === false) $this->host = null;
         }
         
+        // validate username
+        if (!is_null($this->userinfo)) {
+            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
+            $this->userinfo = $encoder->encode($this->userinfo);
+        }
+        
         // validate port
         if (!is_null($this->port)) {
             if ($this->port < 1 || $this->port > 65535) $this->port = null;
         }
         
-        // query and fragment are quite simple in terms of definition:
-        // *( pchar / "/" / "?" ), so define their validation routines
-        // when we start fixing percent encoding
-        
-        // path gets to be validated against a hodge-podge of rules depending
-        // on the status of authority and scheme, but it's not that important,
-        // esp. since it won't be applicable to everyone
+        // validate path
+        $path_parts = array();
+        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
+        if (!is_null($this->host)) {
+            // path-abempty (hier and relative)
+            $this->path = $segments_encoder->encode($this->path);
+        } elseif ($this->path !== '' && $this->path[0] === '/') {
+            // path-absolute (hier and relative)
+            if (strlen($this->path) >= 2 && $this->path[1] === '/') {
+                // This shouldn't ever happen!
+                $this->path = '';
+            } else {
+                $this->path = $segments_encoder->encode($this->path);
+            }
+        } elseif (!is_null($this->scheme) && $this->path !== '') {
+            // path-rootless (hier)
+            // Short circuit evaluation means we don't need to check nz
+            $this->path = $segments_encoder->encode($this->path);
+        } elseif (is_null($this->scheme) && $this->path !== '') {
+            // path-noscheme (relative)
+            // (once again, not checking nz)
+            $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
+            $c = strpos($this->path, '/');
+            if ($c !== false) {
+                $this->path = 
+                    $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
+                    $segments_encoder->encode(substr($this->path, $c));
+            } else {
+                $this->path = $segment_nc_encoder->encode($this->path);
+            }
+        } else {
+            // path-empty (hier and relative)
+            $this->path = ''; // just to be safe
+        }
         
         return true;
         
diff --git a/library/HTMLPurifier/URIParser.php b/library/HTMLPurifier/URIParser.php
index a52e5042..0ea51762 100644
--- a/library/HTMLPurifier/URIParser.php
+++ b/library/HTMLPurifier/URIParser.php
@@ -2,24 +2,39 @@
 
 /**
  * Parses a URI into the components and fragment identifier as specified
- * by RFC 2396.
- * @todo Replace regexps with a native PHP parser
+ * by RFC 3986.
  */
 class HTMLPurifier_URIParser
 {
     
     /**
-     * Parses a URI
+     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
+     */
+    protected $percentEncoder;
+    
+    public function __construct() {
+        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
+    }
+    
+    /**
+     * Parses a URI.
      * @param $uri string URI to parse
-     * @return HTMLPurifier_URI representation of URI
+     * @return HTMLPurifier_URI representation of URI. This representation has
+     *         not been validated yet and may not conform to RFC.
      */
     public function parse($uri) {
+        
+        $uri = $this->percentEncoder->normalize($uri);
+        
+        // Regexp is as per Appendix B.
+        // Note that ["<>] are an addition to the RFC's recommended 
+        // characters, because they represent external delimeters.
         $r_URI = '!'.
-            '(([^:/?#<>\'"]+):)?'. // 2. Scheme
-            '(//([^/?#<>\'"]*))?'. // 4. Authority
-            '([^?#<>\'"]*)'.       // 5. Path
-            '(\?([^#<>\'"]*))?'.   // 7. Query
-            '(#([^<>\'"]*))?'.     // 8. Fragment
+            '(([^:/?#"<>]+):)?'. // 2. Scheme
+            '(//([^/?#"<>]*))?'. // 4. Authority
+            '([^?#"<>]*)'.       // 5. Path
+            '(\?([^#"<>]*))?'.   // 7. Query
+            '(#([^"<>]*))?'.     // 8. Fragment
             '!';
         
         $matches = array();
@@ -36,13 +51,7 @@ class HTMLPurifier_URIParser
         
         // further parse authority
         if ($authority !== null) {
-            // ridiculously inefficient: it's a stacked regex!
-            $HEXDIG = '[A-Fa-f0-9]';
-            $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
-            $sub_delims = '!$&\'()'; // needs []
-            $pct_encoded = "%$HEXDIG$HEXDIG";
-            $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
-            $r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
+            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
             $matches = array();
             preg_match($r_authority, $authority, $matches);
             $userinfo   = !empty($matches[1]) ? $matches[2] : null;
diff --git a/tests/HTMLPurifier/AttrDef/URI/HostTest.php b/tests/HTMLPurifier/AttrDef/URI/HostTest.php
index b05276da..4106685d 100644
--- a/tests/HTMLPurifier/AttrDef/URI/HostTest.php
+++ b/tests/HTMLPurifier/AttrDef/URI/HostTest.php
@@ -14,6 +14,27 @@ class HTMLPurifier_AttrDef_URI_HostTest extends HTMLPurifier_AttrDefHarness
         $this->assertDef('124.15.6.89'); // IPv4
         $this->assertDef('www.google.com'); // reg-name
         
+        // more domain name tests
+        $this->assertDef('test.');
+        $this->assertDef('sub.test.');
+        $this->assertDef('.test', false);
+        $this->assertDef('ff');
+        $this->assertDef('1f', false);
+        $this->assertDef('-f', false);
+        $this->assertDef('f1');
+        $this->assertDef('f-', false);
+        $this->assertDef('sub.ff');
+        $this->assertDef('sub.1f', false);
+        $this->assertDef('sub.-f', false);
+        $this->assertDef('sub.f1');
+        $this->assertDef('sub.f-', false);
+        $this->assertDef('ff.top');
+        $this->assertDef('1f.top');
+        $this->assertDef('-f.top', false);
+        $this->assertDef('ff.top');
+        $this->assertDef('f1.top');
+        $this->assertDef('f-.top', false);
+        
     }
     
 }
diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php
index 026d5d49..a22adaab 100644
--- a/tests/HTMLPurifier/AttrDef/URITest.php
+++ b/tests/HTMLPurifier/AttrDef/URITest.php
@@ -29,6 +29,19 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
         );
     }
     
+    function testPercentEncoding() {
+        $this->assertDef(
+            'http:colon:mercenary',
+            'colon%3Amercenary'
+        );
+    }
+    
+    function testPercentEncodingPreserve() {
+        $this->assertDef(
+            'http://www.example.com/abcABC123-_.!~*()\''
+        );
+    }
+    
     function testEmbeds() {
         $this->def = new HTMLPurifier_AttrDef_URI(true);
         $this->assertDef('http://sub.example.com/alas?foo=asd');
diff --git a/tests/HTMLPurifier/PercentEncoderTest.php b/tests/HTMLPurifier/PercentEncoderTest.php
index b77875fc..6df8eb7a 100644
--- a/tests/HTMLPurifier/PercentEncoderTest.php
+++ b/tests/HTMLPurifier/PercentEncoderTest.php
@@ -35,5 +35,28 @@ class HTMLPurifier_PercentEncoderTest extends HTMLPurifier_Harness
         
     }
     
+    function assertEncode($string, $expect = true, $preserve = false) {
+        if ($expect === true) $expect = $string;
+        $encoder = new HTMLPurifier_PercentEncoder($preserve);
+        $result = $encoder->encode($string);
+        $this->assertIdentical($result, $expect);
+    }
+    
+    function test_encode_noChange() {
+        $this->assertEncode('abc012-_~.');
+    }
+    
+    function test_encode_encode() {
+        $this->assertEncode('>', '%3E');
+    }
+    
+    function test_encode_preserve() {
+        $this->assertEncode('<>', '<%3E', '<');
+    }
+    
+    function test_encode_low() {
+        $this->assertEncode("\1", '%01');
+    }
+    
 }
 
diff --git a/tests/HTMLPurifier/URIParserTest.php b/tests/HTMLPurifier/URIParserTest.php
index 97c6a2d4..ed3aa499 100644
--- a/tests/HTMLPurifier/URIParserTest.php
+++ b/tests/HTMLPurifier/URIParserTest.php
@@ -13,6 +13,13 @@ class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness
         $this->assertEqual($result, $expect);
     }
     
+    function testPercentNormalization() {
+        $this->assertParsing(
+            '%G',
+            null, null, null, null, '%25G', null, null
+        );
+    }
+    
     function testRegular() {
         $this->assertParsing(
             'http://www.example.com/webhp?q=foo#result2',
@@ -121,7 +128,7 @@ class HTMLPurifier_URIParserTest extends HTMLPurifier_Harness
     
     function testMalformedTag() {
         $this->assertParsing(
-            'http://www.example.com/\'>"',
+            'http://www.example.com/>',
             'http', null, 'www.example.com', null, '/', null, null
         );
     }
diff --git a/tests/HTMLPurifier/URITest.php b/tests/HTMLPurifier/URITest.php
index 0d916045..97d77927 100644
--- a/tests/HTMLPurifier/URITest.php
+++ b/tests/HTMLPurifier/URITest.php
@@ -160,4 +160,32 @@ class HTMLPurifier_URITest extends HTMLPurifier_URIHarness
         $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:');
     }
     
+    function test_validate_removeRedundantScheme() {
+        $this->assertValidation('http:foo:/:', 'foo%3A/:');
+    }
+    
+    function test_validate_username() {
+        $this->assertValidation("http://user\xE3\x91\x94:@foo.com", 'http://user%E3%91%94:@foo.com');
+    }
+    
+    function test_validate_path_abempty() {
+        $this->assertValidation("http://host/\xE3\x91\x94:", 'http://host/%E3%91%94:');
+    }
+    
+    function test_validate_path_absolute() {
+        $this->assertValidation("/\xE3\x91\x94:", '/%E3%91%94:');
+    }
+    
+    function test_validate_path_rootless() {
+        $this->assertValidation("mailto:\xE3\x91\x94:", 'mailto:%E3%91%94:');
+    }
+    
+    function test_validate_path_noscheme() {
+        $this->assertValidation("\xE3\x91\x94", '%E3%91%94');
+    }
+    
+    function test_validate_path_empty() {
+        $this->assertValidation('http://google.com');
+    }
+    
 }
-- 
2.11.4.GIT