Issue #5768: Change to Unicode output logic and test case for same.
[python.git] / Lib / urlparse.py
blob5e5d37d132c1231a87c7f770b4a5c24a5cc360b7
1 """Parse (absolute and relative) URLs.
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4 UC Irvine, June 1995.
5 """
# Names exported by ``from urlparse import *``.
__all__ = [
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
    "parse_qs",
    "parse_qsl",
]
# Scheme classification tables.  The empty string stands for "no scheme
# given", so listing '' in a table makes that behaviour the default.

# Schemes for which urljoin() will resolve a relative reference.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs may carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh', 'sftp', 'nfs',
]

# NOTE(review): not consulted by any function in this module; presumably
# kept for external callers -- confirm before removing.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Schemes for which urlsplit() splits off a '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
]

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
# Every character that may appear in a URL scheme name.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    + '0123456789'
    + '+-.'
)

# urlsplit() memoises its results in _parse_cache; once the cache holds
# MAX_CACHE_SIZE entries it is emptied wholesale before the next insert.
MAX_CACHE_SIZE = 20
_parse_cache = dict()
def clear_cache():
    """Discard every entry currently held in the URL parse cache."""
    _parse_cache.clear()
class ResultMixin(object):
    """Shared accessors that dissect the ``netloc`` of a parsed result.

    The class this is mixed into must expose a ``netloc`` string of the
    form ``[user[:password]@]host[:port]``.  The host may be a bracketed
    IPv6 literal such as ``[::1]``.  Each property returns None when the
    component it names is absent.
    """

    def _userinfo(self):
        """Return the text before the last '@' of netloc, or None."""
        netloc = self.netloc
        if "@" in netloc:
            return netloc.rsplit("@", 1)[0]
        return None

    def _hostinfo(self):
        """Return ``(host, port_text)`` from the part after the last '@'.

        ``port_text`` is None when no ':' introduces a port, and may be
        the empty string when the ':' is present but nothing follows it.
        The brackets around an IPv6 literal are not included in ``host``.
        """
        hostport = self.netloc.rsplit("@", 1)[-1]
        if hostport[:1] == "[":
            close = hostport.find("]")
            if close >= 0:
                # Colons inside the brackets belong to the address, so
                # only look for a port after the closing bracket.
                host, rest = hostport[1:close], hostport[close + 1:]
                if ":" in rest:
                    return host, rest.split(":", 1)[1]
                return host, None
        if ":" in hostport:
            host, port_text = hostport.split(":", 1)
            return host, port_text
        return hostport, None

    @property
    def username(self):
        """User name from the userinfo part, or None without an '@'."""
        userinfo = self._userinfo()
        if userinfo is None:
            return None
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Text after the first ':' of the userinfo part, or None."""
        userinfo = self._userinfo()
        if userinfo is not None and ":" in userinfo:
            return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name, or None when it is empty."""
        host = self._hostinfo()[0]
        return host.lower() or None

    @property
    def port(self):
        """Port as an int, or None when missing or empty.

        Raises ValueError if the port text is not a base-10 integer.
        """
        port_text = self._hostinfo()[1]
        if port_text:
            return int(port_text, 10)
        return None
84 from collections import namedtuple
class SplitResult(
        namedtuple('SplitResult', 'scheme netloc path query fragment'),
        ResultMixin):
    """Five-field result of urlsplit() with the netloc accessors mixed in."""

    # No instance attributes are declared beyond the tuple fields.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string with urlunsplit()."""
        return urlunsplit(self)
class ParseResult(
        namedtuple('ParseResult',
                   'scheme netloc path params query fragment'),
        ResultMixin):
    """Six-field result of urlparse() with the netloc accessors mixed in."""

    # No instance attributes are declared beyond the tuple fields.
    __slots__ = ()

    def geturl(self):
        """Reassemble this result into a URL string with urlunparse()."""
        return urlunparse(self)
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a ParseResult, a 6-tuple of
    (scheme, netloc, path, params, query, fragment).  The components
    are not broken into smaller pieces (netloc stays a single string)
    and % escapes are not expanded.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    # Parameters are only split off for schemes registered in uses_params.
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
def _splitparams(url):
    """Split ``;params`` off the last path segment of *url*.

    Only a ';' located after the final '/' counts; semicolons in earlier
    segments are left in the path.  Returns ``(path, params)``, with
    ``params`` set to '' when the last segment contains no ';'.
    """
    # rfind() yields -1 when there is no '/', so the search then starts
    # at index 0 and covers the whole string.
    i = url.find(';', url.rfind('/') + 1)
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
def _splitnetloc(url, start=0):
    """Split *url* at the first '/', '?' or '#' found at or after *start*.

    Returns ``(netloc, rest)``: ``netloc`` is ``url[start:cut]`` and
    ``rest`` is everything from the delimiter onwards.  When none of the
    delimiters occurs, the cut is at the end of the string.
    """
    found = [pos for pos in (url.find(ch, start) for ch in '/?#') if pos >= 0]
    cut = min(found) if found else len(url)
    return url[start:cut], url[cut:]
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a SplitResult, a 5-tuple of
    (scheme, netloc, path, query, fragment).  The components are not
    broken into smaller pieces (netloc stays a single string) and
    % escapes are not expanded.  Results are memoised in _parse_cache.
    """
    allow_fragments = bool(allow_fragments)
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: drop everything rather than prune.
        clear_cache()

    netloc = query = fragment = ''
    # The literal prefix 'http' is the common case; it bypasses the
    # scheme-character check and the uses_* table lookups below.
    is_plain_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            is_plain_http = True
            scheme, url = prefix.lower(), url[colon + 1:]
        elif all(ch in scheme_chars for ch in prefix):
            scheme, url = prefix.lower(), url[colon + 1:]

    if url[:2] == '//' and (is_plain_http or scheme in uses_netloc):
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and '#' in url and (
            is_plain_http or scheme in uses_fragment):
        url, fragment = url.split('#', 1)
    if '?' in url and (is_plain_http or scheme in uses_query):
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
def urlunparse(data):
    """Reassemble a 6-item parsed URL into a single string.

    *data* is (scheme, netloc, path, params, query, fragment).  The
    result may differ slightly from the URL originally parsed, while
    remaining equivalent, if that URL had redundant delimiters, e.g. a
    '?' with an empty query (the draft states these are equivalent).
    """
    scheme, netloc, path, params, query, fragment = data
    if params:
        # Re-attach the parameters to the path before delegating.
        path = "%s;%s" % (path, params)
    return urlunsplit((scheme, netloc, path, query, fragment))
def urlunsplit(data):
    """Reassemble a 5-item split URL into a single string.

    *data* is (scheme, netloc, path, query, fragment).  Empty scheme,
    query and fragment components are omitted together with their
    delimiters.
    """
    scheme, netloc, url, query, fragment = data
    # A '//' authority marker is written when there is a netloc, or when
    # the scheme is registered in uses_netloc and the path does not
    # already begin with '//'.
    wants_authority = netloc or (
        scheme and scheme in uses_netloc and url[:2] != '//')
    if wants_authority:
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return url
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *url* is also returned unchanged when its scheme differs from the
    base's scheme or is not listed in uses_relative.  allow_fragments
    is passed through to urlparse() for both arguments."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default when url does not carry its own.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # url names its own netloc: nothing is taken from the base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty relative path: start from the base path.
        path = bpath
        if not params:
            params = bparams
        else:
            # url supplies its own params: drop the final character of
            # the inherited base path and return without inheriting the
            # base query.
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Base path without its last segment, followed by the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes '' so that the joined path ends in '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete a '<name>/..' pair, rescanning from the start
    # after each deletion.  The scan covers indexes 1 .. len-2, so a '..'
    # in the last position is left for the checks below.  The outer loop
    # ends when a complete scan deletes nothing (inner while's else).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # Trailing '..': replace the last two segments by a single ''.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Remove any existing fragment from *url*.

    Returns a tuple of the defragmented URL and the fragment.  If the
    URL contains no '#', it is returned untouched and the second element
    is the empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
# Private unquote() for parse_qs() and parse_qsl().  It cannot be taken
# from urllib directly: urllib uses this module (urljoin), so importing it
# here would create a circular reference.

# Maps each two-digit hex escape, spelled all-lowercase or all-uppercase,
# to the character with that ordinal.
_hextochr = dict(
    (fmt % code, chr(code))
    for fmt in ('%02x', '%02X')
    for code in range(256)
)


def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    pieces = s.split('%')
    # Text before the first '%' is copied through untouched.
    decoded = [pieces[0]]
    for piece in pieces[1:]:
        try:
            decoded.append(_hextochr[piece[:2]] + piece[2:])
        except KeyError:
            # Not a known two-digit escape: keep the '%' literally.
            decoded.append('%' + piece)
        except UnicodeDecodeError:
            # Joining the table's character with the remainder failed to
            # decode; build the character with unichr() instead.
            decoded.append(unichr(int(piece[:2], 16)) + piece[2:])
    return "".join(decoded)
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dictionary mapping each name to the list of its values,
    in the order parse_qsl() produced them.
    """
    grouped = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        grouped.setdefault(name, []).append(value)
    return grouped
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) tuples.
    """
    result = []
    # Fields are separated by '&' or by ';'.
    for chunk in qs.split('&'):
        for field in chunk.split(';'):
            if not (field or strict_parsing):
                continue
            name, sep, value = field.partition('=')
            if not sep:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name with no equal sign only counts when
                # blank values are kept.
                if not keep_blank_values:
                    continue
            if value or keep_blank_values:
                name = unquote(name.replace('+', ' '))
                value = unquote(value.replace('+', ' '))
                result.append((name, value))
    return result
# Data for test().  The first non-blank line is the base URL.  Every later
# line has the form "<relative> = <URL:expected>", where <expected> is the
# value urljoin(base, relative) should produce.  test() splits each line on
# whitespace, so the spacing between the columns is not significant.
test_input = """
http://a/b/c/d
g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
def test():
    """Run the urljoin() self-test and print the results.

    Input comes from the file named by the first command-line argument,
    from standard input when that argument is '-', or from the built-in
    test_input when no argument is given.  The first URL read becomes the
    base for all later lines.  For each line the parsed form and the
    joined URL are printed; a line of the form "url = <URL:expected>"
    additionally prints an EXPECTED notice when the result differs.
    """
    import sys
    base = ''
    opened = None  # set only for a file opened here, which must be closed
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    try:
        for line in fp:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                # urljoin('', url) returns url, so the first URL read
                # becomes the base.
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    # Joined into one string so the output is the same
                    # whether print is a statement or a function.
                    print(' '.join(['EXPECTED', words[2], '!!!!!!!!!!']))
    finally:
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()