Lib/urlparse.py

   1 """Parse (absolute and relative) URLs.
   2
   3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
   4 UC Irvine, June 1995.
   5 """
   6
   7 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
   8            "urlsplit", "urlunsplit"]
   9
  10 # A classification of schemes ('' means apply by default)
  11 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
  12                  'wais', 'file', 'https', 'shttp', 'mms',
  13                  'prospero', 'rtsp', 'rtspu', '', 'sftp']
  14 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
  15                'imap', 'wais', 'file', 'mms', 'https', 'shttp',
  16                'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
  17                'svn', 'svn+ssh', 'sftp']
  18 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
  19                     'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
  20 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
  21                'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
  22                'mms', '', 'sftp']
  23 uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
  24               'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
  25 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
  26                  'nntp', 'wais', 'https', 'shttp', 'snews',
  27                  'file', 'prospero', '']
  28
  29 # Characters valid in scheme names
  30 scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
  31                 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  32                 '0123456789'
  33                 '+-.')
  34
  35 MAX_CACHE_SIZE = 20
  36 _parse_cache = {}
  37
  38 def clear_cache():
  39     """Clear the parse cache."""
  40     global _parse_cache
  41     _parse_cache = {}
  42
  43
  44 class BaseResult(tuple):
  45     """Base class for the parsed result objects.
  46
  47     This provides the attributes shared by the two derived result
  48     objects as read-only properties.  The derived classes are
  49     responsible for checking the right number of arguments were
  50     supplied to the constructor.
  51
  52     """
  53
  54     __slots__ = ()
  55
  56     # Attributes that access the basic components of the URL:
  57
  58     @property
  59     def scheme(self):
  60         return self[0]
  61
  62     @property
  63     def netloc(self):
  64         return self[1]
  65
  66     @property
  67     def path(self):
  68         return self[2]
  69
  70     @property
  71     def query(self):
  72         return self[-2]
  73
  74     @property
  75     def fragment(self):
  76         return self[-1]
  77
  78     # Additional attributes that provide access to parsed-out portions
  79     # of the netloc:
  80
  81     @property
  82     def username(self):
  83         netloc = self.netloc
  84         if "@" in netloc:
  85             userinfo = netloc.split("@", 1)[0]
  86             if ":" in userinfo:
  87                 userinfo = userinfo.split(":", 1)[0]
  88             return userinfo
  89         return None
  90
  91     @property
  92     def password(self):
  93         netloc = self.netloc
  94         if "@" in netloc:
  95             userinfo = netloc.split("@", 1)[0]
  96             if ":" in userinfo:
  97                 return userinfo.split(":", 1)[1]
  98         return None
  99
 100     @property
 101     def hostname(self):
 102         netloc = self.netloc
 103         if "@" in netloc:
 104             netloc = netloc.split("@", 1)[1]
 105         if ":" in netloc:
 106             netloc = netloc.split(":", 1)[0]
 107         return netloc.lower() or None
 108
 109     @property
 110     def port(self):
 111         netloc = self.netloc
 112         if "@" in netloc:
 113             netloc = netloc.split("@", 1)[1]
 114         if ":" in netloc:
 115             port = netloc.split(":", 1)[1]
 116             return int(port, 10)
 117         return None
 118
 119
 120 class SplitResult(BaseResult):
 121
 122     __slots__ = ()
 123
 124     def __new__(cls, scheme, netloc, path, query, fragment):
 125         return BaseResult.__new__(
 126             cls, (scheme, netloc, path, query, fragment))
 127
 128     def geturl(self):
 129         return urlunsplit(self)
 130
 131
 132 class ParseResult(BaseResult):
 133
 134     __slots__ = ()
 135
 136     def __new__(cls, scheme, netloc, path, params, query, fragment):
 137         return BaseResult.__new__(
 138             cls, (scheme, netloc, path, params, query, fragment))
 139
 140     @property
 141     def params(self):
 142         return self[3]
 143
 144     def geturl(self):
 145         return urlunparse(self)
 146
 147
 148 def urlparse(url, scheme='', allow_fragments=True):
 149     """Parse a URL into 6 components:
 150     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
 151     Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
 152     Note that we don't break the components up in smaller bits
 153     (e.g. netloc is a single string) and we don't expand % escapes."""
 154     tuple = urlsplit(url, scheme, allow_fragments)
 155     scheme, netloc, url, query, fragment = tuple
 156     if scheme in uses_params and ';' in url:
 157         url, params = _splitparams(url)
 158     else:
 159         params = ''
 160     return ParseResult(scheme, netloc, url, params, query, fragment)
 161
 162 def _splitparams(url):
 163     if '/'  in url:
 164         i = url.find(';', url.rfind('/'))
 165         if i < 0:
 166             return url, ''
 167     else:
 168         i = url.find(';')
 169     return url[:i], url[i+1:]
 170
 171 def _splitnetloc(url, start=0):
 172     for c in '/?#': # the order is important!
 173         delim = url.find(c, start)
 174         if delim >= 0:
 175             break
 176     else:
 177         delim = len(url)
 178     return url[start:delim], url[delim:]
 179
 180 def urlsplit(url, scheme='', allow_fragments=True):
 181     """Parse a URL into 5 components:
 182     <scheme>://<netloc>/<path>?<query>#<fragment>
 183     Return a 5-tuple: (scheme, netloc, path, query, fragment).
 184     Note that we don't break the components up in smaller bits
 185     (e.g. netloc is a single string) and we don't expand % escapes."""
 186     allow_fragments = bool(allow_fragments)
 187     key = url, scheme, allow_fragments
 188     cached = _parse_cache.get(key, None)
 189     if cached:
 190         return cached
 191     if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
 192         clear_cache()
 193     netloc = query = fragment = ''
 194     i = url.find(':')
 195     if i > 0:
 196         if url[:i] == 'http': # optimize the common case
 197             scheme = url[:i].lower()
 198             url = url[i+1:]
 199             if url[:2] == '//':
 200                 netloc, url = _splitnetloc(url, 2)
 201             if allow_fragments and '#' in url:
 202                 url, fragment = url.split('#', 1)
 203             if '?' in url:
 204                 url, query = url.split('?', 1)
 205             v = SplitResult(scheme, netloc, url, query, fragment)
 206             _parse_cache[key] = v
 207             return v
 208         for c in url[:i]:
 209             if c not in scheme_chars:
 210                 break
 211         else:
 212             scheme, url = url[:i].lower(), url[i+1:]
 213     if scheme in uses_netloc and url[:2] == '//':
 214         netloc, url = _splitnetloc(url, 2)
 215     if allow_fragments and scheme in uses_fragment and '#' in url:
 216         url, fragment = url.split('#', 1)
 217     if scheme in uses_query and '?' in url:
 218         url, query = url.split('?', 1)
 219     v = SplitResult(scheme, netloc, url, query, fragment)
 220     _parse_cache[key] = v
 221     return v
 222
 223 def urlunparse((scheme, netloc, url, params, query, fragment)):
 224     """Put a parsed URL back together again.  This may result in a
 225     slightly different, but equivalent URL, if the URL that was parsed
 226     originally had redundant delimiters, e.g. a ? with an empty query
 227     (the draft states that these are equivalent)."""
 228     if params:
 229         url = "%s;%s" % (url, params)
 230     return urlunsplit((scheme, netloc, url, query, fragment))
 231
 232 def urlunsplit((scheme, netloc, url, query, fragment)):
 233     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
 234         if url and url[:1] != '/': url = '/' + url
 235         url = '//' + (netloc or '') + url
 236     if scheme:
 237         url = scheme + ':' + url
 238     if query:
 239         url = url + '?' + query
 240     if fragment:
 241         url = url + '#' + fragment
 242     return url
 243
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *url* is also returned unchanged when its scheme
    differs from the base's or is not listed in uses_relative.
    *allow_fragments* is passed through to urlparse() for both URLs."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The base's scheme is the default for the reference, so a reference
    # without a scheme of its own compares equal to the base below.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own network location: nothing is
            # taken from the base.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only the (possibly inherited) netloc is reused.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not (path or params or query):
        # At most a fragment was given: keep the base's path, params and
        # query and attach the reference's fragment.
        return urlunparse((scheme, netloc, bpath,
                           bparams, bquery, fragment))
    # Relative path: drop the last segment of the base path and append
    # the reference's segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' becomes an empty segment, so the joined path ends
    # with '/'; every other '.' segment is removed outright.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly delete the first "<name>/.." pair, where <name> is
    # neither '' nor '..'.  The scan restarts after each deletion and
    # never examines index 0 or the final segment.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A '..' in final position was skipped by the loop above and is
    # handled here.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # Replace the last two segments with a single empty one, leaving
        # a trailing '/'.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
 291
 292 def urldefrag(url):
 293     """Removes any existing fragment from URL.
 294
 295     Returns a tuple of the defragmented URL and the fragment.  If
 296     the URL contained no fragments, the second element is the
 297     empty string.
 298     """
 299     if '#' in url:
 300         s, n, p, a, q, frag = urlparse(url)
 301         defrag = urlunparse((s, n, p, a, q, ''))
 302         return defrag, frag
 303     else:
 304         return url, ''
 305
 306
# Built-in data for test(), read line by line and split on whitespace.
# The first non-blank line is the base URL.  Every later line has the
# form "<reference> = <URL:expected>", where the expected value is what
# urljoin(base, reference) should produce.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
 340
 341 def test():
 342     import sys
 343     base = ''
 344     if sys.argv[1:]:
 345         fn = sys.argv[1]
 346         if fn == '-':
 347             fp = sys.stdin
 348         else:
 349             fp = open(fn)
 350     else:
 351         try:
 352             from cStringIO import StringIO
 353         except ImportError:
 354             from StringIO import StringIO
 355         fp = StringIO(test_input)
 356     while 1:
 357         line = fp.readline()
 358         if not line: break
 359         words = line.split()
 360         if not words:
 361             continue
 362         url = words[0]
 363         parts = urlparse(url)
 364         print '%-10s : %s' % (url, parts)
 365         abs = urljoin(base, url)
 366         if not base:
 367             base = abs
 368         wrapped = '<URL:%s>' % abs
 369         print '%-10s = %s' % (url, wrapped)
 370         if len(words) == 3 and words[1] == '=':
 371             if wrapped != words[2]:
 372                 print 'EXPECTED', words[2], '!!!!!!!!!!'
 373
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()