Don't add nofollow for matching hosts, generalize this code.
[htmlpurifier.git] / library / HTMLPurifier / URI.php
blob79b7a71f156049373e6e0c317732b82bc147191c
1 <?php
/**
 * HTML Purifier's internal representation of a URI.
 * @note
 *      Internal data-structures are completely escaped. If the data needs
 *      to be used in a non-URI context (which is very unlikely), be sure
 *      to decode it first. The URI may not necessarily be well-formed until
 *      validate() is called.
 */
class HTMLPurifier_URI
{

    // URI components; a component that is absent is stored as null.
    public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

    /**
     * @note Automatically normalizes scheme and port: a non-null scheme is
     *       lowercased and a non-null port is cast to an integer. All other
     *       components are stored exactly as given.
     */
    public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
        // ctype_lower() is a cheap pre-check that skips strtolower() when
        // the scheme already consists solely of lowercase letters
        $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
        $this->userinfo = $userinfo;
        $this->host = $host;
        $this->port = is_null($port) ? $port : (int) $port;
        $this->path = $path;
        $this->query = $query;
        $this->fragment = $fragment;
    }

    /**
     * Retrieves a scheme object corresponding to the URI's scheme/default
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Scheme object appropriate for validating this URI, or false
     *         if the registry yields no object for the scheme in question
     */
    public function getSchemeObj($config, $context) {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        if ($this->scheme !== null) {
            $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
            if (!$scheme_obj) return false; // invalid scheme, clean it out
        } else {
            // no scheme: retrieve the default one
            $def = $config->getDefinition('URI');
            $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
            if (!$scheme_obj) {
                // something funky happened to the default scheme object
                trigger_error(
                    'Default scheme object "' . $def->defaultScheme . '" was not readable',
                    E_USER_WARNING
                );
                return false;
            }
        }
        return $scheme_obj;
    }

    /**
     * Generic validation method applicable for all schemes. May modify
     * this URI in order to get it into a compliant form.
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if validation/filtering succeeds, false if failure
     *         (every path through this generic implementation returns true)
     */
    public function validate($config, $context) {

        // ABNF definitions from RFC 3986
        $chars_sub_delims = '!$&\'()*+,;=';
        $chars_pchar = $chars_sub_delims . ':@';

        // validate host
        if (!is_null($this->host)) {
            $host_def = new HTMLPurifier_AttrDef_URI_Host();
            $this->host = $host_def->validate($this->host, $config, $context);
            if ($this->host === false) $this->host = null;
        }

        // validate scheme
        // NOTE: It's not appropriate to check whether or not this
        // scheme is in our registry, since a URIFilter may convert a
        // URI that we don't allow into one we do.  So instead, we just
        // check if the scheme can be dropped because there is no host
        // and it is our default scheme.
        //
        // The host test is grouped explicitly: && binds tighter than ||,
        // so without the parentheses an empty-string host would enter
        // this branch even when there is no scheme to drop.
        if (!is_null($this->scheme) && (is_null($this->host) || $this->host === '')) {
            // support for relative paths is pretty abysmal when the
            // scheme is present, so axe it when possible
            $def = $config->getDefinition('URI');
            if ($def->defaultScheme === $this->scheme) {
                $this->scheme = null;
            }
        }

        // validate username
        if (!is_null($this->userinfo)) {
            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
            $this->userinfo = $encoder->encode($this->userinfo);
        }

        // validate port
        if (!is_null($this->port)) {
            if ($this->port < 1 || $this->port > 65535) $this->port = null;
        }

        // validate path
        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
        if (!is_null($this->host)) { // this catches $this->host === ''
            // path-abempty (hier and relative)
            // http://www.example.com/my/path
            // //www.example.com/my/path (looks odd, but works, and
            //                            recognized by most browsers)
            // (this set is valid or invalid on a scheme by scheme
            // basis, so we'll deal with it later)
            // file:///my/path
            // ///my/path
            $this->path = $segments_encoder->encode($this->path);
        } elseif ($this->path !== '') {
            if ($this->path[0] === '/') {
                // path-absolute (hier and relative)
                // http:/my/path
                // /my/path
                if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                    // This could happen if both the host gets stripped
                    // out
                    // http://my/path
                    // //my/path
                    $this->path = '';
                } else {
                    $this->path = $segments_encoder->encode($this->path);
                }
            } elseif (!is_null($this->scheme)) {
                // path-rootless (hier)
                // http:my/path
                // Short circuit evaluation means we don't need to check nz
                $this->path = $segments_encoder->encode($this->path);
            } else {
                // path-noscheme (relative)
                // my/path
                // (once again, not checking nz)
                // The first segment uses an encoder whose character list
                // omits ':', unlike the one used for the rest of the path.
                $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
                $c = strpos($this->path, '/');
                if ($c !== false) {
                    $this->path =
                        $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
                        $segments_encoder->encode(substr($this->path, $c));
                } else {
                    $this->path = $segment_nc_encoder->encode($this->path);
                }
            }
        } else {
            // path-empty (hier and relative)
            $this->path = ''; // just to be safe
        }

        // qf = query and fragment
        $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

        if (!is_null($this->query)) {
            $this->query = $qf_encoder->encode($this->query);
        }

        if (!is_null($this->fragment)) {
            $this->fragment = $qf_encoder->encode($this->fragment);
        }

        return true;

    }

    /**
     * Convert URI back to string
     * @return String URI appropriate for output
     */
    public function toString() {
        // reconstruct authority
        $authority = null;
        // there is a rendering difference between a null authority
        // (http:foo-bar) and an empty string authority
        // (http:///foo-bar).
        if (!is_null($this->host)) {
            $authority = '';
            if (!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
            $authority .= $this->host;
            if (!is_null($this->port)) $authority .= ':' . $this->port;
        }

        // Reconstruct the result
        // One might wonder about parsing quirks from browsers after
        // this reconstruction.  Unfortunately, parsing behavior depends
        // on what *scheme* was employed (file:///foo is handled *very*
        // differently than http:///foo), so unfortunately we have to
        // defer to the schemes to do the right thing.
        $result = '';
        if (!is_null($this->scheme)) $result .= $this->scheme . ':';
        if (!is_null($authority)) $result .= '//' . $authority;
        $result .= $this->path;
        if (!is_null($this->query)) $result .= '?' . $this->query;
        if (!is_null($this->fragment)) $result .= '#' . $this->fragment;

        return $result;
    }

    /**
     * Returns true if this URL might be considered a 'local' URL given
     * the current context.  This is true when the host is null, or
     * when it matches the host supplied to the configuration.
     *
     * Note that this does not do any scheme checking (URI.Munge, I'm
     * looking at you).
     *
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context (not used here)
     * @return bool True if the host is null or identical to the host of
     *         the 'URI' definition, false otherwise
     */
    public function isLocal($config, $context) {
        if ($this->host === null) return true;
        $uri_def = $config->getDefinition('URI');
        return $uri_def->host === $this->host;
    }

}
219 // vim: et sw=4 sts=4