1 """Parse (absolute and relative) URLs.
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    global _parse_cache
    _parse_cache = {}

class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            netloc = netloc.split(":", 1)[0]
        return netloc.lower() or None

    @property
    def port(self):
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            port = netloc.split(":", 1)[1]
            return int(port, 10)
        return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)
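
# Example (the URL here is an arbitrary sample): both result types are
# namedtuples, so they unpack like plain tuples, and ResultMixin adds the
# username/password/hostname/port accessors defined above.
#
#     >>> p = urlsplit('http://User:Pass@www.example.com:8080/path')
#     >>> p.username, p.password, p.hostname, p.port
#     ('User', 'Pass', 'www.example.com', 8080)
#     >>> p.geturl()
#     'http://User:Pass@www.example.com:8080/path'
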
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
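
# Example (the URL is an arbitrary sample): the params component is split
# off the last path segment only for schemes listed in uses_params.
#
#     >>> urlparse('http://a.example/p;pp?q=1#f')
#     ParseResult(scheme='http', netloc='a.example', path='/p', params='pp', query='q=1', fragment='f')
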
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)
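
# Example (illustrative): with start=2, everything between the leading '//'
# and the first '/', '?' or '#' is taken as the network location.
#
#     >>> _splitnetloc('//host.example/a?b', 2)
#     ('host.example', '/a?b')
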
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
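
# Example (URLs are arbitrary samples): the fast path above handles 'http';
# other schemes fall through to the generic branch, so a non-hierarchical
# scheme such as mailto keeps its body in the path component.
#
#     >>> urlsplit('http://www.example.com/p?q=1#f')
#     SplitResult(scheme='http', netloc='www.example.com', path='/p', query='q=1', fragment='f')
#     >>> urlsplit('mailto:someone@example.com')
#     SplitResult(scheme='mailto', netloc='', path='someone@example.com', query='', fragment='')
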
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

def urlunsplit(data):
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
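
# Example (illustrative): urlunsplit is the inverse of urlsplit up to
# redundant delimiters; a '?' with an empty query is dropped on the way back.
#
#     >>> urlunsplit(('http', 'www.example.com', '/p', 'q=1', 'f'))
#     'http://www.example.com/p?q=1#f'
#     >>> urlunsplit(urlsplit('http://www.example.com/p?'))
#     'http://www.example.com/p'
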
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if not params:
            params = bparams
            if not query:
                query = bquery
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
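
# Examples (taken from the RFC 1808 cases exercised by test_input below):
#
#     >>> urljoin('http://a/b/c/d', 'g')
#     'http://a/b/c/g'
#     >>> urljoin('http://a/b/c/d', '../g')
#     'http://a/b/g'
#     >>> urljoin('http://a/b/c/d', '//g')
#     'http://g'
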
257 """Removes any existing fragment from URL.
259 Returns a tuple of the defragmented URL and the fragment. If
260 the URL contained no fragments, the second element is the
264 s
, n
, p
, a
, q
, frag
= urlparse(url
)
265 defrag
= urlunparse((s
, n
, p
, a
, q
, ''))
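
# Example (illustrative):
#
#     >>> urldefrag('http://www.example.com/p#sec2')
#     ('http://www.example.com/p', 'sec2')
#     >>> urldefrag('http://www.example.com/p')
#     ('http://www.example.com/p', '')
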
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).

_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
_hextochr.update(('%02X' % i, chr(i)) for i in range(256))

278 """unquote('abc%20def') -> 'abc def'."""
280 for i
in xrange(1, len(res
)):
283 res
[i
] = _hextochr
[item
[:2]] + item
[2:]
286 except UnicodeDecodeError:
287 res
[i
] = unichr(int(item
[:2], 16)) + item
[2:]
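
# Example (the second input is an arbitrary sample): valid %xx escapes are
# decoded via the _hextochr table; malformed escapes are left untouched by
# the KeyError handler above.
#
#     >>> unquote('abc%20def')
#     'abc def'
#     >>> unquote('%7e/%fg')
#     '~/%fg'
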
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict
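
# Example (illustrative): repeated names accumulate into one list per key.
#
#     >>> parse_qs('a=1&b=2&a=3')
#     {'a': ['1', '3'], 'b': ['2']}
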
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError, "bad query field: %r" % (name_value,)
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))
    return r
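
# Example (illustrative): unlike parse_qs, order and duplicates survive;
# keep_blank_values controls whether empty values are kept.
#
#     >>> parse_qsl('a=1&b=2&a=3')
#     [('a', '1'), ('b', '2'), ('a', '3')]
#     >>> parse_qsl('a=1&b=&c', keep_blank_values=1)
#     [('a', '1'), ('b', ''), ('c', '')]
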
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y     = <URL:http://a/b/c/d?y>
      http:g?y    = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""

def test():
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        fp = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'
    fp.close()

if __name__ == '__main__':
    test()