4 * HTML Purifier's internal representation of a URI.
6 * Internal data-structures are completely escaped. If the data needs
7 * to be used in a non-URI context (which is very unlikely), be sure
8 * to decode it first. The URI may not necessarily be well-formed until
9 * validate() is called.
11 class HTMLPurifier_URI
49 * @param string $scheme
50 * @param string $userinfo
54 * @param string $query
55 * @param string $fragment
56 * @note Automatically normalizes scheme and port
58 public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment)
// Stores the URI components on the instance, normalizing the scheme and the
// port on the way in.
// NOTE(review): no assignment of $host or $path is visible in this excerpt;
// lines appear to be missing here -- confirm against the full file.
// Scheme: kept unchanged when it is null or ctype_lower() accepts it,
// otherwise it is lowercased with strtolower().
60 $this->scheme
= is_null($scheme) ||
ctype_lower($scheme) ?
$scheme : strtolower($scheme);
// Userinfo is stored without modification.
61 $this->userinfo
= $userinfo;
// Port: null is preserved; any other value is cast to int.
63 $this->port
= is_null($port) ?
$port : (int)$port;
// Query and fragment are stored without modification.
65 $this->query
= $query;
66 $this->fragment
= $fragment;
70 * Retrieves a scheme object corresponding to the URI's scheme/default
71 * @param HTMLPurifier_Config $config
72 * @param HTMLPurifier_Context $context
73 * @return HTMLPurifier_URIScheme Scheme object appropriate for validating this URI
75 public function getSchemeObj($config, $context)
// Looks up the scheme object for this URI: through the scheme registry when
// a scheme is set, otherwise through the URI definition's default scheme.
// NOTE(review): the return statements and several braces are not visible in
// this excerpt, so the value returned on each path cannot be confirmed here.
77 $registry = HTMLPurifier_URISchemeRegistry
::instance();
78 if ($this->scheme
!== null) {
// Explicit scheme: ask the registry for the matching scheme object.
79 $scheme_obj = $registry->getScheme($this->scheme
, $config, $context);
82 } // invalid scheme, clean it out
84 // no scheme: retrieve the default one
85 $def = $config->getDefinition('URI');
86 $scheme_obj = $def->getDefaultScheme($config, $context);
// The "was not readable" message is only built when the definition names a
// default scheme (defaultScheme is not null).
88 if ($def->defaultScheme
!== null) {
89 // something funky happened to the default scheme object
91 'Default scheme object "' . $def->defaultScheme
. '" was not readable',
94 } // suppress error if it's null
102 * Generic validation method applicable for all schemes. May modify
103 * this URI in order to get it into a compliant form.
104 * @param HTMLPurifier_Config $config
105 * @param HTMLPurifier_Context $context
106 * @return bool True if validation/filtering succeeds, false if failure
108 public function validate($config, $context)
// Normalizes the URI components in place: validates the host, drops the
// scheme when it equals the configured default and there is no usable host,
// percent-encodes userinfo, path, query and fragment, and range-checks the
// port.
// NOTE(review): several lines of this method (braces, the bodies of some
// branches, the return statement) are not visible in this excerpt; the
// comments below describe only what the visible statements do.
110 // ABNF definitions from RFC 3986
111 $chars_sub_delims = '!$&\'()*+,;=';
// $chars_gen_delims is not referenced by any statement visible in this excerpt.
112 $chars_gen_delims = ':/?#[]@';
113 $chars_pchar = $chars_sub_delims . ':@';
// Host: run it through the host attribute definition and store the result
// back on the instance.
116 if (!is_null($this->host
)) {
117 $host_def = new HTMLPurifier_AttrDef_URI_Host();
118 $this->host
= $host_def->validate($this->host
, $config, $context);
// A strict false result is special-cased; the handling itself is not visible
// in this excerpt.
119 if ($this->host
=== false) {
125 // NOTE: It's not appropriate to check whether or not this
126 // scheme is in our registry, since a URIFilter may convert a
127 // URI that we don't allow into one we do. So instead, we just
128 // check if the scheme can be dropped because there is no host
129 // and it is our default scheme.
// NOTE(review): && binds tighter than ||, so this condition groups as
// (scheme !== null && host === null) || host === ''. An empty-string host
// therefore enters the branch even when the scheme is null -- confirm that
// this grouping is intended.
130 if (!is_null($this->scheme
) && is_null($this->host
) ||
$this->host
=== '') {
131 // support for relative paths is pretty abysmal when the
132 // scheme is present, so axe it when possible
133 $def = $config->getDefinition('URI');
134 if ($def->defaultScheme
=== $this->scheme
) {
135 $this->scheme
= null;
// Userinfo: re-encode using an encoder constructed with sub-delims plus ':'.
// NOTE(review): presumably the constructor argument lists characters that are
// left unencoded -- confirm against HTMLPurifier_PercentEncoder.
140 if (!is_null($this->userinfo
)) {
141 $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
142 $this->userinfo
= $encoder->encode($this->userinfo
);
// Port: values outside 1..65535 are special-cased; the handling itself is not
// visible in this excerpt.
146 if (!is_null($this->port
)) {
147 if ($this->port
< 1 ||
$this->port
> 65535) {
// Path: the encoder used depends on whether a host is present, whether the
// path starts with '/', and whether a scheme is present.
153 $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
154 if (!is_null($this->host
)) { // this catches $this->host === ''
155 // path-abempty (hier and relative)
156 // http://www.example.com/my/path
157 // //www.example.com/my/path (looks odd, but works, and
158 // recognized by most browsers)
159 // (this set is valid or invalid on a scheme by scheme
160 // basis, so we'll deal with it later)
163 $this->path
= $segments_encoder->encode($this->path
);
164 } elseif ($this->path
!== '') {
165 if ($this->path
[0] === '/') {
166 // path-absolute (hier and relative)
// A path beginning with two slashes is detected here; what is done about it
// is not visible in this excerpt.
169 if (strlen($this->path
) >= 2 && $this->path
[1] === '/') {
170 // This could happen if both the host gets stripped
176 $this->path
= $segments_encoder->encode($this->path
);
178 } elseif (!is_null($this->scheme
)) {
179 // path-rootless (hier)
181 // Short circuit evaluation means we don't need to check nz
182 $this->path
= $segments_encoder->encode($this->path
);
184 // path-noscheme (relative)
186 // (once again, not checking nz)
// This encoder is built without ':' in its character list, unlike
// $segments_encoder. $c is the offset of the first '/' in the path, as
// returned by strpos().
187 $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
188 $c = strpos($this->path
, '/');
// The part before the first '/' goes through the no-colon encoder and the
// remainder through the regular segments encoder.
// NOTE(review): the condition that chooses between this split form and the
// whole-path form below is not visible in this excerpt.
191 $segment_nc_encoder->encode(substr($this->path
, 0, $c)) .
192 $segments_encoder->encode(substr($this->path
, $c));
194 $this->path
= $segment_nc_encoder->encode($this->path
);
198 // path-empty (hier and relative)
199 $this->path
= ''; // just to be safe
202 // qf = query and fragment
// Query and fragment share one encoder, built with pchar plus '/' and '?'.
203 $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');
205 if (!is_null($this->query
)) {
206 $this->query
= $qf_encoder->encode($this->query
);
209 if (!is_null($this->fragment
)) {
210 $this->fragment
= $qf_encoder->encode($this->fragment
);
216 * Convert URI back to string
217 * @return string URI appropriate for output
219 public function toString()
// Reassembles the URI string from the stored components, in the order
// scheme, authority, path, query, fragment.
// NOTE(review): the initialisation of $authority and $result and the return
// statement are not visible in this excerpt.
221 // reconstruct authority
223 // there is a rendering difference between a null authority
224 // (http:foo-bar) and an empty string authority
225 // (http:///foo-bar).
// The authority is only built when the host is not null; it takes the form
// userinfo '@' host ':' port, with the userinfo and port parts added only
// when they are not null.
226 if (!is_null($this->host
)) {
228 if (!is_null($this->userinfo
)) {
229 $authority .= $this->userinfo
. '@';
231 $authority .= $this->host
;
232 if (!is_null($this->port
)) {
233 $authority .= ':' . $this->port
;
237 // Reconstruct the result
238 // One might wonder about parsing quirks from browsers after
239 // this reconstruction. Unfortunately, parsing behavior depends
240 // on what *scheme* was employed (file:///foo is handled *very*
241 // differently than http:///foo), so unfortunately we have to
242 // defer to the schemes to do the right thing.
// Scheme is followed by ':' and is added only when not null.
244 if (!is_null($this->scheme
)) {
245 $result .= $this->scheme
. ':';
// The '//' prefix is written whenever $authority is not null.
247 if (!is_null($authority)) {
248 $result .= '//' . $authority;
// The path is appended unconditionally; query and fragment only when not
// null, introduced by '?' and '#' respectively.
250 $result .= $this->path
;
251 if (!is_null($this->query
)) {
252 $result .= '?' . $this->query
;
254 if (!is_null($this->fragment
)) {
255 $result .= '#' . $this->fragment
;
262 * Returns true if this URL might be considered a 'local' URL given
263 * the current context. This is true when the host is null, or
264 * when it matches the host supplied to the configuration.
266 * Note that this does not do any scheme checking, so it is mostly
267 * only appropriate for metadata that doesn't care about protocol
268 * security. isBenign is probably what you actually want.
269 * @param HTMLPurifier_Config $config
270 * @param HTMLPurifier_Context $context
273 public function isLocal($config, $context)
// Tests whether this URI counts as local: either it has no host at all, or
// its host is identical to the host held by the URI definition.
// NOTE(review): the return statements are not visible in this excerpt; the
// outcome of each check is taken from the method's docblock -- confirm.
275 if ($this->host
=== null) {
278 $uri_def = $config->getDefinition('URI');
// Strict comparison against the host stored on the URI definition.
279 if ($uri_def->host
=== $this->host
) {
286 * Returns true if this URL should be considered a 'benign' URL,
289 * - It is a local URL (isLocal), and
290 * - It has an equal or better level of security
291 * @param HTMLPurifier_Config $config
292 * @param HTMLPurifier_Context $context
295 public function isBenign($config, $context)
// Combines the locality check with a comparison of the `secure` flags of
// this URI's scheme object and the default scheme object.
// NOTE(review): the return statements and the condition guarding the
// "conservative approach" branch are not visible in this excerpt.
// A URI that is not local is handled first.
297 if (!$this->isLocal($config, $context)) {
301 $scheme_obj = $this->getSchemeObj($config, $context);
304 } // conservative approach
// Compare against the default scheme: the inner check only applies when the
// default scheme is flagged secure and this URI's scheme is not.
306 $current_scheme_obj = $config->getDefinition('URI')->getDefaultScheme($config, $context);
307 if ($current_scheme_obj->secure
) {
308 if (!$scheme_obj->secure
) {
316 // vim: et sw=4 sts=4