1 """Parse (absolute and relative) URLs.
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
# Public names exported by ``from urlparse import *``.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]
# A classification of schemes ('' means apply by default).
# These tables drive the parsing decisions below: whether a scheme may be
# used with relative references, carries a network location, and supports
# params, queries and fragments.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
# NOTE(review): tail of this list was lost in the source listing; restored
# as 'mms', '' to match the sibling tables -- confirm against upstream.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names (letters, digits, '+', '-', '.').
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
def clear_cache():
    """Clear the parse cache."""
    # NOTE(review): only the docstring of this function survived in the
    # listing; body restored to reset the module-level memoization dict
    # used by urlsplit() -- confirm against upstream.
    global _parse_cache
    _parse_cache = {}
class ResultMixin(object):
    """Shared methods for the parsed result objects.

    Exposes the userinfo/host/port sub-components of the ``netloc``
    attribute supplied by the concrete result classes.
    """

    @property
    def username(self):
        """User name from the netloc's userinfo, or None if absent."""
        netloc = self.netloc
        if "@" in netloc:
            # rsplit: '@' may legally appear (percent-encoded aside) in userinfo.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the netloc's userinfo, or None if absent."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Host name, lower-cased; None when the host part is empty."""
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            netloc = netloc.split(":", 1)[0]
        return netloc.lower() or None

    @property
    def port(self):
        """Port number as an int, or None when no port is present."""
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            port = netloc.split(":", 1)[1]
            return int(port, 10)
        return None
84 from collections
import namedtuple
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'),
                  ResultMixin):
    """5-tuple result of urlsplit() with named fields and netloc helpers."""

    __slots__ = ()  # keep instances as bare tuples (no per-instance dict)

    def geturl(self):
        """Re-assemble the URL this result was parsed from."""
        return urlunsplit(self)
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'),
                  ResultMixin):
    """6-tuple result of urlparse() with named fields and netloc helpers."""

    __slots__ = ()  # keep instances as bare tuples (no per-instance dict)

    def geturl(self):
        """Re-assemble the URL this result was parsed from."""
        return urlunparse(self)
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Renamed the local from 'tuple' (shadowed the builtin) to 'parts'.
    parts = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = parts
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
116 def _splitparams(url
):
118 i
= url
.find(';', url
.rfind('/'))
123 return url
[:i
], url
[i
+1:]
125 def _splitnetloc(url
, start
=0):
126 delim
= len(url
) # position of end of domain part of url, default is end
127 for c
in '/?#': # look for delimiters; the order is NOT important
128 wdelim
= url
.find(c
, start
) # find first of this delim
129 if wdelim
>= 0: # if found
130 delim
= min(delim
, wdelim
) # use earliest delim position
131 return url
[start
:delim
], url
[delim
:] # return (domain, rest)
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # Cache key includes argument types so byte/unicode inputs don't collide.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        # General case: only accept the prefix as a scheme when every
        # character before ':' is a valid scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
def urlunsplit(data):
    """Combine the elements of a 5-tuple as returned by urlsplit() into a
    complete URL as a string, adding back the ://, ?, # delimiters only
    when the corresponding component is present."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not (path or params or query):
        return urlunparse((scheme, netloc, bpath,
                           bparams, bquery, fragment))
    # Merge the base path (minus its last segment) with the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Collapse '<segment>/..' pairs, one per pass, until none remain.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
266 http:g = <URL:http://a/b/c/g>
267 http: = <URL:http://a/b/c/d>
268 g = <URL:http://a/b/c/g>
269 ./g = <URL:http://a/b/c/g>
270 g/ = <URL:http://a/b/c/g/>
271 /g = <URL:http://a/g>
273 ?y = <URL:http://a/b/c/d?y>
274 g?y = <URL:http://a/b/c/g?y>
275 g?y/./x = <URL:http://a/b/c/g?y/./x>
276 . = <URL:http://a/b/c/>
277 ./ = <URL:http://a/b/c/>
278 .. = <URL:http://a/b/>
279 ../ = <URL:http://a/b/>
280 ../g = <URL:http://a/b/g>
281 ../.. = <URL:http://a/>
282 ../../g = <URL:http://a/g>
283 ../../../g = <URL:http://a/../g>
284 ./../g = <URL:http://a/b/g>
285 ./g/. = <URL:http://a/b/c/g/>
286 /./g = <URL:http://a/./g>
287 g/./h = <URL:http://a/b/c/g/h>
288 g/../h = <URL:http://a/b/c/h>
289 http:g = <URL:http://a/b/c/g>
290 http: = <URL:http://a/b/c/d>
291 http:?y = <URL:http://a/b/c/d?y>
292 http:g?y = <URL:http://a/b/c/g?y>
293 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
307 from cStringIO
import StringIO
309 from StringIO
import StringIO
310 fp
= StringIO(test_input
)
316 parts
= urlparse(url
)
317 print '%-10s : %s' % (url
, parts
)
318 abs = urljoin(base
, url
)
321 wrapped
= '<URL:%s>' % abs
322 print '%-10s = %s' % (url
, wrapped
)
323 if len(words
) == 3 and words
[1] == '=':
324 if wrapped
!= words
[2]:
325 print 'EXPECTED', words
[2], '!!!!!!!!!!'
if __name__ == '__main__':
    test()