Lib/urlparse.py

   1 """Parse (absolute and relative) URLs.
   2
   3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
   4 UC Irvine, June 1995.
   5 """
   6
# Names exported by ``from urlparse import *``.  Only the names listed here
# are part of the star-import surface; everything else defined in this module
# has to be imported explicitly.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
   9
  10 # A classification of schemes ('' means apply by default)
  11 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
  12                  'wais', 'file', 'https', 'shttp', 'mms',
  13                  'prospero', 'rtsp', 'rtspu', '', 'sftp']
  14 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
  15                'imap', 'wais', 'file', 'mms', 'https', 'shttp',
  16                'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
  17                'svn', 'svn+ssh', 'sftp','nfs']
  18 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
  19                     'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
  20 uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
  21                'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
  22                'mms', '', 'sftp']
  23 uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
  24               'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
  25 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
  26                  'nntp', 'wais', 'https', 'shttp', 'snews',
  27                  'file', 'prospero', '']
  28
  29 # Characters valid in scheme names
  30 scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
  31                 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  32                 '0123456789'
  33                 '+-.')
  34
  35 MAX_CACHE_SIZE = 20
  36 _parse_cache = {}
  37
  38 def clear_cache():
  39     """Clear the parse cache."""
  40     _parse_cache.clear()
  41
  42
  43 class ResultMixin(object):
  44     """Shared methods for the parsed result objects."""
  45
  46     @property
  47     def username(self):
  48         netloc = self.netloc
  49         if "@" in netloc:
  50             userinfo = netloc.rsplit("@", 1)[0]
  51             if ":" in userinfo:
  52                 userinfo = userinfo.split(":", 1)[0]
  53             return userinfo
  54         return None
  55
  56     @property
  57     def password(self):
  58         netloc = self.netloc
  59         if "@" in netloc:
  60             userinfo = netloc.rsplit("@", 1)[0]
  61             if ":" in userinfo:
  62                 return userinfo.split(":", 1)[1]
  63         return None
  64
  65     @property
  66     def hostname(self):
  67         netloc = self.netloc
  68         if "@" in netloc:
  69             netloc = netloc.rsplit("@", 1)[1]
  70         if ":" in netloc:
  71             netloc = netloc.split(":", 1)[0]
  72         return netloc.lower() or None
  73
  74     @property
  75     def port(self):
  76         netloc = self.netloc
  77         if "@" in netloc:
  78             netloc = netloc.rsplit("@", 1)[1]
  79         if ":" in netloc:
  80             port = netloc.split(":", 1)[1]
  81             return int(port, 10)
  82         return None
  83
  84 from collections import namedtuple
  85
  86 class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
  87
  88     __slots__ = ()
  89
  90     def geturl(self):
  91         return urlunsplit(self)
  92
  93
  94 class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
  95
  96     __slots__ = ()
  97
  98     def geturl(self):
  99         return urlunparse(self)
 100
 101
 102 def urlparse(url, scheme='', allow_fragments=True):
 103     """Parse a URL into 6 components:
 104     <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
 105     Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
 106     Note that we don't break the components up in smaller bits
 107     (e.g. netloc is a single string) and we don't expand % escapes."""
 108     tuple = urlsplit(url, scheme, allow_fragments)
 109     scheme, netloc, url, query, fragment = tuple
 110     if scheme in uses_params and ';' in url:
 111         url, params = _splitparams(url)
 112     else:
 113         params = ''
 114     return ParseResult(scheme, netloc, url, params, query, fragment)
 115
 116 def _splitparams(url):
 117     if '/'  in url:
 118         i = url.find(';', url.rfind('/'))
 119         if i < 0:
 120             return url, ''
 121     else:
 122         i = url.find(';')
 123     return url[:i], url[i+1:]
 124
 125 def _splitnetloc(url, start=0):
 126     delim = len(url)   # position of end of domain part of url, default is end
 127     for c in '/?#':    # look for delimiters; the order is NOT important
 128         wdelim = url.find(c, start)        # find first of this delim
 129         if wdelim >= 0:                    # if found
 130             delim = min(delim, wdelim)     # use earliest delim position
 131     return url[start:delim], url[delim:]   # return (domain, rest)
 132
 133 def urlsplit(url, scheme='', allow_fragments=True):
 134     """Parse a URL into 5 components:
 135     <scheme>://<netloc>/<path>?<query>#<fragment>
 136     Return a 5-tuple: (scheme, netloc, path, query, fragment).
 137     Note that we don't break the components up in smaller bits
 138     (e.g. netloc is a single string) and we don't expand % escapes."""
 139     allow_fragments = bool(allow_fragments)
 140     key = url, scheme, allow_fragments, type(url), type(scheme)
 141     cached = _parse_cache.get(key, None)
 142     if cached:
 143         return cached
 144     if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
 145         clear_cache()
 146     netloc = query = fragment = ''
 147     i = url.find(':')
 148     if i > 0:
 149         if url[:i] == 'http': # optimize the common case
 150             scheme = url[:i].lower()
 151             url = url[i+1:]
 152             if url[:2] == '//':
 153                 netloc, url = _splitnetloc(url, 2)
 154             if allow_fragments and '#' in url:
 155                 url, fragment = url.split('#', 1)
 156             if '?' in url:
 157                 url, query = url.split('?', 1)
 158             v = SplitResult(scheme, netloc, url, query, fragment)
 159             _parse_cache[key] = v
 160             return v
 161         for c in url[:i]:
 162             if c not in scheme_chars:
 163                 break
 164         else:
 165             scheme, url = url[:i].lower(), url[i+1:]
 166     if scheme in uses_netloc and url[:2] == '//':
 167         netloc, url = _splitnetloc(url, 2)
 168     if allow_fragments and scheme in uses_fragment and '#' in url:
 169         url, fragment = url.split('#', 1)
 170     if scheme in uses_query and '?' in url:
 171         url, query = url.split('?', 1)
 172     v = SplitResult(scheme, netloc, url, query, fragment)
 173     _parse_cache[key] = v
 174     return v
 175
 176 def urlunparse(data):
 177     """Put a parsed URL back together again.  This may result in a
 178     slightly different, but equivalent URL, if the URL that was parsed
 179     originally had redundant delimiters, e.g. a ? with an empty query
 180     (the draft states that these are equivalent)."""
 181     scheme, netloc, url, params, query, fragment = data
 182     if params:
 183         url = "%s;%s" % (url, params)
 184     return urlunsplit((scheme, netloc, url, query, fragment))
 185
 186 def urlunsplit(data):
 187     scheme, netloc, url, query, fragment = data
 188     if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
 189         if url and url[:1] != '/': url = '/' + url
 190         url = '//' + (netloc or '') + url
 191     if scheme:
 192         url = scheme + ':' + url
 193     if query:
 194         url = url + '?' + query
 195     if fragment:
 196         url = url + '#' + fragment
 197     return url
 198
 199 def urljoin(base, url, allow_fragments=True):
 200     """Join a base URL and a possibly relative URL to form an absolute
 201     interpretation of the latter."""
 202     if not base:
 203         return url
 204     if not url:
 205         return base
 206     bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
 207             urlparse(base, '', allow_fragments)
 208     scheme, netloc, path, params, query, fragment = \
 209             urlparse(url, bscheme, allow_fragments)
 210     if scheme != bscheme or scheme not in uses_relative:
 211         return url
 212     if scheme in uses_netloc:
 213         if netloc:
 214             return urlunparse((scheme, netloc, path,
 215                                params, query, fragment))
 216         netloc = bnetloc
 217     if path[:1] == '/':
 218         return urlunparse((scheme, netloc, path,
 219                            params, query, fragment))
 220     if not path:
 221         path = bpath
 222         if not params:
 223             params = bparams
 224         else:
 225             path = path[:-1]
 226             return urlunparse((scheme, netloc, path,
 227                                 params, query, fragment))
 228         if not query:
 229             query = bquery
 230         return urlunparse((scheme, netloc, path,
 231                            params, query, fragment))
 232     segments = bpath.split('/')[:-1] + path.split('/')
 233     # XXX The stuff below is bogus in various ways...
 234     if segments[-1] == '.':
 235         segments[-1] = ''
 236     while '.' in segments:
 237         segments.remove('.')
 238     while 1:
 239         i = 1
 240         n = len(segments) - 1
 241         while i < n:
 242             if (segments[i] == '..'
 243                 and segments[i-1] not in ('', '..')):
 244                 del segments[i-1:i+1]
 245                 break
 246             i = i+1
 247         else:
 248             break
 249     if segments == ['', '..']:
 250         segments[-1] = ''
 251     elif len(segments) >= 2 and segments[-1] == '..':
 252         segments[-2:] = ['']
 253     return urlunparse((scheme, netloc, '/'.join(segments),
 254                        params, query, fragment))
 255
 256 def urldefrag(url):
 257     """Removes any existing fragment from URL.
 258
 259     Returns a tuple of the defragmented URL and the fragment.  If
 260     the URL contained no fragments, the second element is the
 261     empty string.
 262     """
 263     if '#' in url:
 264         s, n, p, a, q, frag = urlparse(url)
 265         defrag = urlunparse((s, n, p, a, q, ''))
 266         return defrag, frag
 267     else:
 268         return url, ''
 269
 270 # unquote method for parse_qs and parse_qsl
 271 # Cannot use directly from urllib as it would create circular reference.
 272 # urllib uses urlparse methods ( urljoin)
 273
 274 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
 275 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
 276
 277 def unquote(s):
 278     """unquote('abc%20def') -> 'abc def'."""
 279     res = s.split('%')
 280     for i in xrange(1, len(res)):
 281         item = res[i]
 282         try:
 283             res[i] = _hextochr[item[:2]] + item[2:]
 284         except KeyError:
 285             res[i] = '%' + item
 286         except UnicodeDecodeError:
 287             res[i] = unichr(int(item[:2], 16)) + item[2:]
 288     return "".join(res)
 289
 290 def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
 291     """Parse a query given as a string argument.
 292
 293         Arguments:
 294
 295         qs: URL-encoded query string to be parsed
 296
 297         keep_blank_values: flag indicating whether blank values in
 298             URL encoded queries should be treated as blank strings.
 299             A true value indicates that blanks should be retained as
 300             blank strings.  The default false value indicates that
 301             blank values are to be ignored and treated as if they were
 302             not included.
 303
 304         strict_parsing: flag indicating what to do with parsing errors.
 305             If false (the default), errors are silently ignored.
 306             If true, errors raise a ValueError exception.
 307     """
 308     dict = {}
 309     for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
 310         if name in dict:
 311             dict[name].append(value)
 312         else:
 313             dict[name] = [value]
 314     return dict
 315
 316 def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
 317     """Parse a query given as a string argument.
 318
 319     Arguments:
 320
 321     qs: URL-encoded query string to be parsed
 322
 323     keep_blank_values: flag indicating whether blank values in
 324         URL encoded queries should be treated as blank strings.  A
 325         true value indicates that blanks should be retained as blank
 326         strings.  The default false value indicates that blank values
 327         are to be ignored and treated as if they were  not included.
 328
 329     strict_parsing: flag indicating what to do with parsing errors. If
 330         false (the default), errors are silently ignored. If true,
 331         errors raise a ValueError exception.
 332
 333     Returns a list, as G-d intended.
 334     """
 335     pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 336     r = []
 337     for name_value in pairs:
 338         if not name_value and not strict_parsing:
 339             continue
 340         nv = name_value.split('=', 1)
 341         if len(nv) != 2:
 342             if strict_parsing:
 343                 raise ValueError, "bad query field: %r" % (name_value,)
 344             # Handle case of a control-name with no equal sign
 345             if keep_blank_values:
 346                 nv.append('')
 347             else:
 348                 continue
 349         if len(nv[1]) or keep_blank_values:
 350             name = unquote(nv[0].replace('+', ' '))
 351             value = unquote(nv[1].replace('+', ' '))
 352             r.append((name, value))
 353
 354     return r
 355
 356
# Default input for test(), used when no file name is given on the command
# line.  test() splits every non-blank line on whitespace: the first word is
# a URL to parse and join, and when the line has the form
# "<url> = <expected>" the joined result, wrapped as "<URL:...>", is compared
# with the third word.  The first URL becomes the base for all later joins.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
 390
 391 def test():
 392     import sys
 393     base = ''
 394     if sys.argv[1:]:
 395         fn = sys.argv[1]
 396         if fn == '-':
 397             fp = sys.stdin
 398         else:
 399             fp = open(fn)
 400     else:
 401         try:
 402             from cStringIO import StringIO
 403         except ImportError:
 404             from StringIO import StringIO
 405         fp = StringIO(test_input)
 406     for line in fp:
 407         words = line.split()
 408         if not words:
 409             continue
 410         url = words[0]
 411         parts = urlparse(url)
 412         print '%-10s : %s' % (url, parts)
 413         abs = urljoin(base, url)
 414         if not base:
 415             base = abs
 416         wrapped = '<URL:%s>' % abs
 417         print '%-10s = %s' % (url, wrapped)
 418         if len(words) == 3 and words[1] == '=':
 419             if wrapped != words[2]:
 420                 print 'EXPECTED', words[2], '!!!!!!!!!!'
 421
 422 if __name__ == '__main__':
 423     test()