/**
 * Parses a URI into its components and fragment identifier, following the
 * generic syntax regular expression given in RFC 3986, Appendix B.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    /**
     * Creates the parser together with the percent-encoder it uses to
     * normalize every URI before splitting it up.
     */
    public function __construct()
    {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     *
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI. This representation has
     *         not been validated yet and may not conform to RFC. Boolean
     *         false is returned instead when the top-level regular
     *         expression does not produce a match.
     */
    public function parse($uri)
    {
        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $r_URI = '!'.
            '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) {
            return false; // *really* invalid URI
        }

        // Separate out the parts. Each odd-numbered group includes its
        // delimiter (":", "//", "?", "#"), so it can never be the string
        // "0" and empty() is a safe "was this component present?" test.
        // It also covers trailing groups that preg_match() leaves unset.
        $scheme    = !empty($matches[1]) ? $matches[2] : null;
        $authority = !empty($matches[3]) ? $matches[4] : null;
        $path      = $matches[5]; // always present, can be empty
        $query     = !empty($matches[6]) ? $matches[7] : null;
        $fragment  = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = '/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/';
            $matches = array();
            preg_match($r_authority, $authority, $matches);

            // Group 1 carries the trailing "@", so empty() is safe here too.
            $userinfo = !empty($matches[1]) ? $matches[2] : null;

            // The host group holds the bare host text, and "0" is a
            // legitimate host. empty("0") is true in PHP, so test for
            // presence with isset() instead of emptiness.
            $host = isset($matches[3]) ? $matches[3] : '';

            // Group 4 carries the leading ":". An empty port ("host:")
            // casts to 0 here; that is left as-is for later validation.
            $port = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}