1 """Parse (absolute and relative) URLs.
3 urlparse module is based upon the following RFC specifications.
5 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6 and L. Masinter, January 2005.
8 RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
9 and L.Masinter, December 1999.
11 RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12 Berners-Lee, R. Fielding, and L. Masinter, August 1998.
14 RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
16 RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
17 1995.
19 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20 McCahill, December 1994
22 RFC 3986 is considered the current standard and any future changes to
23 urlparse module should conform with it. The urlparse module is
24 currently not entirely compliant with this RFC due to defacto
25 scenarios for parsing, and for backward compatibility purposes, some
26 parsing quirks from older RFCs are retained. The testcases in
27 test_urlparse.py provides a good indicator of parsing behavior.
29 """

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
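
# Illustrative sketch (doctest-style comments, not executed on import):
# which components urlparse() splits off depends on the scheme's
# membership in the lists above.
#
#   >>> urlparse('http://a/b;p?q#f')    # 'http' uses params, query, fragment
#   ParseResult(scheme='http', netloc='a', path='/b', params='p', query='q', fragment='f')
#   >>> urlparse('mailto:someone@example.com')    # non-hierarchical scheme
#   ParseResult(scheme='mailto', netloc='', path='someone@example.com', params='', query='', fragment='')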

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()

class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            return int(port, 10)
        else:
            return None

from collections import namedtuple

class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunsplit(self)


class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):

    __slots__ = ()

    def geturl(self):
        return urlunparse(self)
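
# Illustrative sketch (doctest-style comments, not executed on import):
# the ResultMixin properties take the userinfo, host and port apart,
# including the bracketed IPv6 host form.
#
#   >>> p = urlsplit('http://user:secret@[::1]:8080/index')
#   >>> p.username, p.password, p.hostname, p.port
#   ('user', 'secret', '::1', 8080)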

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    tuple = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = tuple
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
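
# Illustrative sketch (doctest-style comments, not executed on import):
# with no scheme and no '//' the whole input lands in the path; the
# `scheme` argument only supplies a default.
#
#   >>> urlparse('www.example.com/index.html')
#   ParseResult(scheme='', netloc='', path='www.example.com/index.html', params='', query='', fragment='')
#   >>> urlparse('//www.example.com/index.html', scheme='http')
#   ParseResult(scheme='http', netloc='www.example.com', path='/index.html', params='', query='', fragment='')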

def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)
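
# Illustrative sketch (doctest-style comment, not executed on import):
#
#   >>> _splitnetloc('//www.example.com/path?q', 2)
#   ('www.example.com', '/path?q')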

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
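
# Illustrative sketch (doctest-style comments, not executed on import):
#
#   >>> urlsplit('https://www.example.com/path?q=1#top')
#   SplitResult(scheme='https', netloc='www.example.com', path='/path', query='q=1', fragment='top')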

def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
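
# Illustrative sketch (doctest-style comment, not executed on import):
# parsing and unparsing round-trips for well-formed URLs.
#
#   >>> urlunparse(urlparse('http://a/b;p?q#f'))
#   'http://a/b;p?q#f'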

def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
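
# Illustrative sketch (doctest-style comments, not executed on import):
# a leading '/' and the '//' are supplied as needed.
#
#   >>> urlunsplit(('http', 'example.com', 'path', 'q=1', ''))
#   'http://example.com/path?q=1'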

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
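
# Illustrative sketch (doctest-style comments, not executed on import;
# cases taken from the test table at the bottom of this file):
#
#   >>> urljoin('http://a/b/c/d', '../g')
#   'http://a/b/g'
#   >>> urljoin('http://a/b/c/d', '//g')
#   'http://g'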

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
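
# Illustrative sketch (doctest-style comment, not executed on import):
#
#   >>> urldefrag('http://a/b#frag')
#   ('http://a/b', 'frag')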

# unquote function for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python3.

_hexdig = '0123456789ABCDEFabcdef'
_hextochr = dict((a+b, chr(int(a+b, 16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    # fastpath
    if len(res) == 1:
        return s
    s = res[0]
    for item in res[1:]:
        try:
            s += _hextochr[item[:2]] + item[2:]
        except KeyError:
            s += '%' + item
        except UnicodeDecodeError:
            s += unichr(int(item[:2], 16)) + item[2:]
    return s
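
# Illustrative sketch (doctest-style comments, not executed on import):
# escapes that aren't valid hex digit pairs pass through untouched.
#
#   >>> unquote('abc%20def%zz')
#   'abc def%zz'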

def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    dict = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in dict:
            dict[name].append(value)
        else:
            dict[name] = [value]
    return dict
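
# Illustrative sketch (doctest-style comments, not executed on import):
# repeated names accumulate into one list per key.
#
#   >>> parse_qs('a=1&a=2&b=3')    # key order in the dict may vary
#   {'a': ['1', '2'], 'b': ['3']}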

def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError, "bad query field: %r" % (name_value,)
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
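
# Illustrative sketch (doctest-style comments, not executed on import):
# with keep_blank_values set, bare names and empty values survive.
#
#   >>> parse_qsl('a=1&b=&c', keep_blank_values=1)
#   [('a', '1'), ('b', ''), ('c', '')]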

test_input = """
      http://a/b/c/d

      g:h          = <URL:g:h>
      http:g       = <URL:http://a/b/c/g>
      http:        = <URL:http://a/b/c/d>
      g            = <URL:http://a/b/c/g>
      ./g          = <URL:http://a/b/c/g>
      g/           = <URL:http://a/b/c/g/>
      /g           = <URL:http://a/g>
      //g          = <URL:http://g>
      ?y           = <URL:http://a/b/c/d?y>
      g?y          = <URL:http://a/b/c/g?y>
      g?y/./x      = <URL:http://a/b/c/g?y/./x>
      .            = <URL:http://a/b/c/>
      ./           = <URL:http://a/b/c/>
      ..           = <URL:http://a/b/>
      ../          = <URL:http://a/b/>
      ../g         = <URL:http://a/b/g>
      ../..        = <URL:http://a/>
      ../../g      = <URL:http://a/g>
      ../../../g   = <URL:http://a/../g>
      ./../g       = <URL:http://a/b/g>
      ./g/.        = <URL:http://a/b/c/g/>
      /./g         = <URL:http://a/./g>
      g/./h        = <URL:http://a/b/c/g/h>
      g/../h       = <URL:http://a/b/c/h>
      http:g       = <URL:http://a/b/c/g>
      http:        = <URL:http://a/b/c/d>
      http:?y      = <URL:http://a/b/c/d?y>
      http:g?y     = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""

def test():
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()