Lib/urllib2.py

   1 """An extensible library for opening URLs using a variety of protocols
   2
   3 The simplest way to use this module is to call the urlopen function,
   4 which accepts a string containing a URL or a Request object (described
   5 below).  It opens the URL and returns the results as file-like
   6 object; the returned object has some extra methods described below.
   7
   8 The OpenerDirector manages a collection of Handler objects that do
   9 all the actual work.  Each Handler implements a particular protocol or
  10 option.  The OpenerDirector is a composite object that invokes the
  11 Handlers needed to open the requested URL.  For example, the
  12 HTTPHandler performs HTTP GET and POST requests and deals with
  13 non-error returns.  The HTTPRedirectHandler automatically deals with
  14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
  15 deals with digest authentication.
  16
  17 urlopen(url, data=None) -- basic usage is the same as original
  18 urllib.  pass the url and optionally data to post to an HTTP URL, and
  19 get a file-like object back.  One difference is that you can also pass
  20 a Request instance instead of URL.  Raises a URLError (subclass of
  21 IOError); for HTTP errors, raises an HTTPError, which can also be
  22 treated as a valid response.
  23
  24 build_opener -- function that creates a new OpenerDirector instance.
  25 will install the default handlers.  accepts one or more Handlers as
  26 arguments, either instances or Handler classes that it will
  27 instantiate.  if one of the argument is a subclass of the default
  28 handler, the argument will be installed instead of the default.
  29
  30 install_opener -- installs a new opener as the default opener.
  31
  32 objects of interest:
  33 OpenerDirector --
  34
  35 Request -- an object that encapsulates the state of a request.  the
  36 state can be a simple as the URL.  it can also include extra HTTP
  37 headers, e.g. a User-Agent.
  38
  39 BaseHandler --
  40
  41 exceptions:
  42 URLError-- a subclass of IOError, individual protocols have their own
  43 specific subclass
  44
  45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
  46 as an exceptional event or valid response
  47
  48 internals:
  49 BaseHandler and parent
  50 _call_chain conventions
  51
  52 Example usage:
  53
  54 import urllib2
  55
  56 # set up authentication info
  57 authinfo = urllib2.HTTPBasicAuthHandler()
  58 authinfo.add_password('realm', 'host', 'username', 'password')
  59
  60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
  61
  62 # build a new opener that adds authentication and caching FTP handlers
  63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
  64
  65 # install it
  66 urllib2.install_opener(opener)
  67
  68 f = urllib2.urlopen('http://www.python.org/')
  69
  70
  71 """
  72
  73 # XXX issues:
  74 # If an authentication error handler that tries to perform
  75 # authentication for some reason but fails, how should the error be
  76 # signalled?  The client needs to know the HTTP error code.  But if
  77 # the handler knows that the problem was, e.g., that it didn't know
  78 # that hash algo that requested in the challenge, it would be good to
  79 # pass that information along to the client, too.
  80 # ftp errors aren't handled cleanly
  81 # check digest against correct (i.e. non-apache) implementation
  82
  83 # Possible extensions:
  84 # complex proxies  XXX not sure what exactly was meant by this
  85 # abstract factory for opener
  86
  87 import base64
  88 import hashlib
  89 import httplib
  90 import mimetools
  91 import os
  92 import posixpath
  93 import random
  94 import re
  95 import socket
  96 import sys
  97 import time
  98 import urlparse
  99 import bisect
 100
 101 try:
 102     from cStringIO import StringIO
 103 except ImportError:
 104     from StringIO import StringIO
 105
 106 from urllib import (unwrap, unquote, splittype, splithost, quote,
 107      addinfourl, splitport, splitgophertype, splitquery,
 108      splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
 109
 110 # support for FileHandler, proxies via environment variables
 111 from urllib import localhost, url2pathname, getproxies
 112
 113 # used in User-Agent header sent
 114 __version__ = sys.version[:3]
 115
 116 _opener = None
 117 def urlopen(url, data=None):
 118     global _opener
 119     if _opener is None:
 120         _opener = build_opener()
 121     return _opener.open(url, data)
 122
 123 def install_opener(opener):
 124     global _opener
 125     _opener = opener
 126
 127 # do these error classes make sense?
 128 # make sure all of the IOError stuff is overridden.  we just want to be
 129 # subtypes.
 130
 131 class URLError(IOError):
 132     # URLError is a sub-type of IOError, but it doesn't share any of
 133     # the implementation.  need to override __init__ and __str__.
 134     # It sets self.args for compatibility with other EnvironmentError
 135     # subclasses, but args doesn't have the typical format with errno in
 136     # slot 0 and strerror in slot 1.  This may be better than nothing.
 137     def __init__(self, reason):
 138         self.args = reason,
 139         self.reason = reason
 140
 141     def __str__(self):
 142         return '<urlopen error %s>' % self.reason
 143
 144 class HTTPError(URLError, addinfourl):
 145     """Raised when HTTP error occurs, but also acts like non-error return"""
 146     __super_init = addinfourl.__init__
 147
 148     def __init__(self, url, code, msg, hdrs, fp):
 149         self.code = code
 150         self.msg = msg
 151         self.hdrs = hdrs
 152         self.fp = fp
 153         self.filename = url
 154         # The addinfourl classes depend on fp being a valid file
 155         # object.  In some cases, the HTTPError may not have a valid
 156         # file object.  If this happens, the simplest workaround is to
 157         # not initialize the base classes.
 158         if fp is not None:
 159             self.__super_init(fp, hdrs, url)
 160
 161     def __str__(self):
 162         return 'HTTP Error %s: %s' % (self.code, self.msg)
 163
 164 class GopherError(URLError):
 165     pass
 166
 167 # copied from cookielib.py
 168 _cut_port_re = re.compile(r":\d+$")
 169 def request_host(request):
 170     """Return request-host, as defined by RFC 2965.
 171
 172     Variation from RFC: returned value is lowercased, for convenient
 173     comparison.
 174
 175     """
 176     url = request.get_full_url()
 177     host = urlparse.urlparse(url)[1]
 178     if host == "":
 179         host = request.get_header("Host", "")
 180
 181     # remove port, if present
 182     host = _cut_port_re.sub("", host, 1)
 183     return host.lower()
 184
 185 class Request:
 186
 187     def __init__(self, url, data=None, headers={},
 188                  origin_req_host=None, unverifiable=False):
 189         # unwrap('<URL:type://host/path>') --> 'type://host/path'
 190         self.__original = unwrap(url)
 191         self.type = None
 192         # self.__r_type is what's left after doing the splittype
 193         self.host = None
 194         self.port = None
 195         self.data = data
 196         self.headers = {}
 197         for key, value in headers.items():
 198             self.add_header(key, value)
 199         self.unredirected_hdrs = {}
 200         if origin_req_host is None:
 201             origin_req_host = request_host(self)
 202         self.origin_req_host = origin_req_host
 203         self.unverifiable = unverifiable
 204
 205     def __getattr__(self, attr):
 206         # XXX this is a fallback mechanism to guard against these
 207         # methods getting called in a non-standard order.  this may be
 208         # too complicated and/or unnecessary.
 209         # XXX should the __r_XXX attributes be public?
 210         if attr[:12] == '_Request__r_':
 211             name = attr[12:]
 212             if hasattr(Request, 'get_' + name):
 213                 getattr(self, 'get_' + name)()
 214                 return getattr(self, attr)
 215         raise AttributeError, attr
 216
 217     def get_method(self):
 218         if self.has_data():
 219             return "POST"
 220         else:
 221             return "GET"
 222
 223     # XXX these helper methods are lame
 224
 225     def add_data(self, data):
 226         self.data = data
 227
 228     def has_data(self):
 229         return self.data is not None
 230
 231     def get_data(self):
 232         return self.data
 233
 234     def get_full_url(self):
 235         return self.__original
 236
 237     def get_type(self):
 238         if self.type is None:
 239             self.type, self.__r_type = splittype(self.__original)
 240             if self.type is None:
 241                 raise ValueError, "unknown url type: %s" % self.__original
 242         return self.type
 243
 244     def get_host(self):
 245         if self.host is None:
 246             self.host, self.__r_host = splithost(self.__r_type)
 247             if self.host:
 248                 self.host = unquote(self.host)
 249         return self.host
 250
 251     def get_selector(self):
 252         return self.__r_host
 253
 254     def set_proxy(self, host, type):
 255         self.host, self.type = host, type
 256         self.__r_host = self.__original
 257
 258     def get_origin_req_host(self):
 259         return self.origin_req_host
 260
 261     def is_unverifiable(self):
 262         return self.unverifiable
 263
 264     def add_header(self, key, val):
 265         # useful for something like authentication
 266         self.headers[key.capitalize()] = val
 267
 268     def add_unredirected_header(self, key, val):
 269         # will not be added to a redirected request
 270         self.unredirected_hdrs[key.capitalize()] = val
 271
 272     def has_header(self, header_name):
 273         return (header_name in self.headers or
 274                 header_name in self.unredirected_hdrs)
 275
 276     def get_header(self, header_name, default=None):
 277         return self.headers.get(
 278             header_name,
 279             self.unredirected_hdrs.get(header_name, default))
 280
 281     def header_items(self):
 282         hdrs = self.unredirected_hdrs.copy()
 283         hdrs.update(self.headers)
 284         return hdrs.items()
 285
 286 class OpenerDirector:
 287     def __init__(self):
 288         client_version = "Python-urllib/%s" % __version__
 289         self.addheaders = [('User-agent', client_version)]
 290         # manage the individual handlers
 291         self.handlers = []
 292         self.handle_open = {}
 293         self.handle_error = {}
 294         self.process_response = {}
 295         self.process_request = {}
 296
 297     def add_handler(self, handler):
 298         added = False
 299         for meth in dir(handler):
 300             i = meth.find("_")
 301             protocol = meth[:i]
 302             condition = meth[i+1:]
 303
 304             if condition.startswith("error"):
 305                 j = condition.find("_") + i + 1
 306                 kind = meth[j+1:]
 307                 try:
 308                     kind = int(kind)
 309                 except ValueError:
 310                     pass
 311                 lookup = self.handle_error.get(protocol, {})
 312                 self.handle_error[protocol] = lookup
 313             elif condition == "open":
 314                 kind = protocol
 315                 lookup = self.handle_open
 316             elif condition == "response":
 317                 kind = protocol
 318                 lookup = self.process_response
 319             elif condition == "request":
 320                 kind = protocol
 321                 lookup = self.process_request
 322             else:
 323                 continue
 324
 325             handlers = lookup.setdefault(kind, [])
 326             if handlers:
 327                 bisect.insort(handlers, handler)
 328             else:
 329                 handlers.append(handler)
 330             added = True
 331
 332         if added:
 333             # XXX why does self.handlers need to be sorted?
 334             bisect.insort(self.handlers, handler)
 335             handler.add_parent(self)
 336
 337     def close(self):
 338         # Only exists for backwards compatibility.
 339         pass
 340
 341     def _call_chain(self, chain, kind, meth_name, *args):
 342         # Handlers raise an exception if no one else should try to handle
 343         # the request, or return None if they can't but another handler
 344         # could.  Otherwise, they return the response.
 345         handlers = chain.get(kind, ())
 346         for handler in handlers:
 347             func = getattr(handler, meth_name)
 348
 349             result = func(*args)
 350             if result is not None:
 351                 return result
 352
 353     def open(self, fullurl, data=None):
 354         # accept a URL or a Request object
 355         if isinstance(fullurl, basestring):
 356             req = Request(fullurl, data)
 357         else:
 358             req = fullurl
 359             if data is not None:
 360                 req.add_data(data)
 361
 362         protocol = req.get_type()
 363
 364         # pre-process request
 365         meth_name = protocol+"_request"
 366         for processor in self.process_request.get(protocol, []):
 367             meth = getattr(processor, meth_name)
 368             req = meth(req)
 369
 370         response = self._open(req, data)
 371
 372         # post-process response
 373         meth_name = protocol+"_response"
 374         for processor in self.process_response.get(protocol, []):
 375             meth = getattr(processor, meth_name)
 376             response = meth(req, response)
 377
 378         return response
 379
 380     def _open(self, req, data=None):
 381         result = self._call_chain(self.handle_open, 'default',
 382                                   'default_open', req)
 383         if result:
 384             return result
 385
 386         protocol = req.get_type()
 387         result = self._call_chain(self.handle_open, protocol, protocol +
 388                                   '_open', req)
 389         if result:
 390             return result
 391
 392         return self._call_chain(self.handle_open, 'unknown',
 393                                 'unknown_open', req)
 394
 395     def error(self, proto, *args):
 396         if proto in ('http', 'https'):
 397             # XXX http[s] protocols are special-cased
 398             dict = self.handle_error['http'] # https is not different than http
 399             proto = args[2]  # YUCK!
 400             meth_name = 'http_error_%s' % proto
 401             http_err = 1
 402             orig_args = args
 403         else:
 404             dict = self.handle_error
 405             meth_name = proto + '_error'
 406             http_err = 0
 407         args = (dict, proto, meth_name) + args
 408         result = self._call_chain(*args)
 409         if result:
 410             return result
 411
 412         if http_err:
 413             args = (dict, 'default', 'http_error_default') + orig_args
 414             return self._call_chain(*args)
 415
 416 # XXX probably also want an abstract factory that knows when it makes
 417 # sense to skip a superclass in favor of a subclass and when it might
 418 # make sense to include both
 419
 420 def build_opener(*handlers):
 421     """Create an opener object from a list of handlers.
 422
 423     The opener will use several default handlers, including support
 424     for HTTP and FTP.
 425
 426     If any of the handlers passed as arguments are subclasses of the
 427     default handlers, the default handlers will not be used.
 428     """
 429     import types
 430     def isclass(obj):
 431         return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")
 432
 433     opener = OpenerDirector()
 434     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
 435                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
 436                        FTPHandler, FileHandler, HTTPErrorProcessor]
 437     if hasattr(httplib, 'HTTPS'):
 438         default_classes.append(HTTPSHandler)
 439     skip = []
 440     for klass in default_classes:
 441         for check in handlers:
 442             if isclass(check):
 443                 if issubclass(check, klass):
 444                     skip.append(klass)
 445             elif isinstance(check, klass):
 446                 skip.append(klass)
 447     for klass in skip:
 448         default_classes.remove(klass)
 449
 450     for klass in default_classes:
 451         opener.add_handler(klass())
 452
 453     for h in handlers:
 454         if isclass(h):
 455             h = h()
 456         opener.add_handler(h)
 457     return opener
 458
 459 class BaseHandler:
 460     handler_order = 500
 461
 462     def add_parent(self, parent):
 463         self.parent = parent
 464
 465     def close(self):
 466         # Only exists for backwards compatibility
 467         pass
 468
 469     def __lt__(self, other):
 470         if not hasattr(other, "handler_order"):
 471             # Try to preserve the old behavior of having custom classes
 472             # inserted after default ones (works only for custom user
 473             # classes which are not aware of handler_order).
 474             return True
 475         return self.handler_order < other.handler_order
 476
 477
 478 class HTTPErrorProcessor(BaseHandler):
 479     """Process HTTP error responses."""
 480     handler_order = 1000  # after all other processing
 481
 482     def http_response(self, request, response):
 483         code, msg, hdrs = response.code, response.msg, response.info()
 484
 485         if code not in (200, 206):
 486             response = self.parent.error(
 487                 'http', request, response, code, msg, hdrs)
 488
 489         return response
 490
 491     https_response = http_response
 492
 493 class HTTPDefaultErrorHandler(BaseHandler):
 494     def http_error_default(self, req, fp, code, msg, hdrs):
 495         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
 496
 497 class HTTPRedirectHandler(BaseHandler):
 498     # maximum number of redirections to any single URL
 499     # this is needed because of the state that cookies introduce
 500     max_repeats = 4
 501     # maximum total number of redirections (regardless of URL) before
 502     # assuming we're in a loop
 503     max_redirections = 10
 504
 505     def redirect_request(self, req, fp, code, msg, headers, newurl):
 506         """Return a Request or None in response to a redirect.
 507
 508         This is called by the http_error_30x methods when a
 509         redirection response is received.  If a redirection should
 510         take place, return a new Request to allow http_error_30x to
 511         perform the redirect.  Otherwise, raise HTTPError if no-one
 512         else should try to handle this url.  Return None if you can't
 513         but another Handler might.
 514         """
 515         m = req.get_method()
 516         if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
 517             or code in (301, 302, 303) and m == "POST"):
 518             # Strictly (according to RFC 2616), 301 or 302 in response
 519             # to a POST MUST NOT cause a redirection without confirmation
 520             # from the user (of urllib2, in this case).  In practice,
 521             # essentially all clients do redirect in this case, so we
 522             # do the same.
 523             # be conciliant with URIs containing a space
 524             newurl = newurl.replace(' ', '%20')
 525             return Request(newurl,
 526                            headers=req.headers,
 527                            origin_req_host=req.get_origin_req_host(),
 528                            unverifiable=True)
 529         else:
 530             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
 531
 532     # Implementation note: To avoid the server sending us into an
 533     # infinite loop, the request object needs to track what URLs we
 534     # have already seen.  Do this by adding a handler-specific
 535     # attribute to the Request object.
 536     def http_error_302(self, req, fp, code, msg, headers):
 537         # Some servers (incorrectly) return multiple Location headers
 538         # (so probably same goes for URI).  Use first header.
 539         if 'location' in headers:
 540             newurl = headers.getheaders('location')[0]
 541         elif 'uri' in headers:
 542             newurl = headers.getheaders('uri')[0]
 543         else:
 544             return
 545         newurl = urlparse.urljoin(req.get_full_url(), newurl)
 546
 547         # XXX Probably want to forget about the state of the current
 548         # request, although that might interact poorly with other
 549         # handlers that also use handler-specific request attributes
 550         new = self.redirect_request(req, fp, code, msg, headers, newurl)
 551         if new is None:
 552             return
 553
 554         # loop detection
 555         # .redirect_dict has a key url if url was previously visited.
 556         if hasattr(req, 'redirect_dict'):
 557             visited = new.redirect_dict = req.redirect_dict
 558             if (visited.get(newurl, 0) >= self.max_repeats or
 559                 len(visited) >= self.max_redirections):
 560                 raise HTTPError(req.get_full_url(), code,
 561                                 self.inf_msg + msg, headers, fp)
 562         else:
 563             visited = new.redirect_dict = req.redirect_dict = {}
 564         visited[newurl] = visited.get(newurl, 0) + 1
 565
 566         # Don't close the fp until we are sure that we won't use it
 567         # with HTTPError.
 568         fp.read()
 569         fp.close()
 570
 571         return self.parent.open(new)
 572
 573     http_error_301 = http_error_303 = http_error_307 = http_error_302
 574
 575     inf_msg = "The HTTP server returned a redirect error that would " \
 576               "lead to an infinite loop.\n" \
 577               "The last 30x error message was:\n"
 578
 579
 580 def _parse_proxy(proxy):
 581     """Return (scheme, user, password, host/port) given a URL or an authority.
 582
 583     If a URL is supplied, it must have an authority (host:port) component.
 584     According to RFC 3986, having an authority component means the URL must
 585     have two slashes after the scheme:
 586
 587     >>> _parse_proxy('file:/ftp.example.com/')
 588     Traceback (most recent call last):
 589     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
 590
 591     The first three items of the returned tuple may be None.
 592
 593     Examples of authority parsing:
 594
 595     >>> _parse_proxy('proxy.example.com')
 596     (None, None, None, 'proxy.example.com')
 597     >>> _parse_proxy('proxy.example.com:3128')
 598     (None, None, None, 'proxy.example.com:3128')
 599
 600     The authority component may optionally include userinfo (assumed to be
 601     username:password):
 602
 603     >>> _parse_proxy('joe:password@proxy.example.com')
 604     (None, 'joe', 'password', 'proxy.example.com')
 605     >>> _parse_proxy('joe:password@proxy.example.com:3128')
 606     (None, 'joe', 'password', 'proxy.example.com:3128')
 607
 608     Same examples, but with URLs instead:
 609
 610     >>> _parse_proxy('http://proxy.example.com/')
 611     ('http', None, None, 'proxy.example.com')
 612     >>> _parse_proxy('http://proxy.example.com:3128/')
 613     ('http', None, None, 'proxy.example.com:3128')
 614     >>> _parse_proxy('http://joe:password@proxy.example.com/')
 615     ('http', 'joe', 'password', 'proxy.example.com')
 616     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
 617     ('http', 'joe', 'password', 'proxy.example.com:3128')
 618
 619     Everything after the authority is ignored:
 620
 621     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
 622     ('ftp', 'joe', 'password', 'proxy.example.com')
 623
 624     Test for no trailing '/' case:
 625
 626     >>> _parse_proxy('http://joe:password@proxy.example.com')
 627     ('http', 'joe', 'password', 'proxy.example.com')
 628
 629     """
 630     scheme, r_scheme = splittype(proxy)
 631     if not r_scheme.startswith("/"):
 632         # authority
 633         scheme = None
 634         authority = proxy
 635     else:
 636         # URL
 637         if not r_scheme.startswith("//"):
 638             raise ValueError("proxy URL with no authority: %r" % proxy)
 639         # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
 640         # and 3.3.), path is empty or starts with '/'
 641         end = r_scheme.find("/", 2)
 642         if end == -1:
 643             end = None
 644         authority = r_scheme[2:end]
 645     userinfo, hostport = splituser(authority)
 646     if userinfo is not None:
 647         user, password = splitpasswd(userinfo)
 648     else:
 649         user = password = None
 650     return scheme, user, password, hostport
 651
 652 class ProxyHandler(BaseHandler):
 653     # Proxies must be in front
 654     handler_order = 100
 655
 656     def __init__(self, proxies=None):
 657         if proxies is None:
 658             proxies = getproxies()
 659         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 660         self.proxies = proxies
 661         for type, url in proxies.items():
 662             setattr(self, '%s_open' % type,
 663                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
 664                     meth(r, proxy, type))
 665
 666     def proxy_open(self, req, proxy, type):
 667         orig_type = req.get_type()
 668         proxy_type, user, password, hostport = _parse_proxy(proxy)
 669         if proxy_type is None:
 670             proxy_type = orig_type
 671         if user and password:
 672             user_pass = '%s:%s' % (unquote(user), unquote(password))
 673             creds = base64.encodestring(user_pass).strip()
 674             req.add_header('Proxy-authorization', 'Basic ' + creds)
 675         hostport = unquote(hostport)
 676         req.set_proxy(hostport, proxy_type)
 677         if orig_type == proxy_type:
 678             # let other handlers take care of it
 679             return None
 680         else:
 681             # need to start over, because the other handlers don't
 682             # grok the proxy's URL type
 683             # e.g. if we have a constructor arg proxies like so:
 684             # {'http': 'ftp://proxy.example.com'}, we may end up turning
 685             # a request for http://acme.example.com/a into one for
 686             # ftp://proxy.example.com/a
 687             return self.parent.open(req)
 688
 689 class HTTPPasswordMgr:
 690
 691     def __init__(self):
 692         self.passwd = {}
 693
 694     def add_password(self, realm, uri, user, passwd):
 695         # uri could be a single URI or a sequence
 696         if isinstance(uri, basestring):
 697             uri = [uri]
 698         uri = tuple(map(self.reduce_uri, uri))
 699         if not realm in self.passwd:
 700             self.passwd[realm] = {}
 701         self.passwd[realm][uri] = (user, passwd)
 702
 703     def find_user_password(self, realm, authuri):
 704         domains = self.passwd.get(realm, {})
 705         authuri = self.reduce_uri(authuri)
 706         for uris, authinfo in domains.iteritems():
 707             for uri in uris:
 708                 if self.is_suburi(uri, authuri):
 709                     return authinfo
 710         return None, None
 711
 712     def reduce_uri(self, uri):
 713         """Accept netloc or URI and extract only the netloc and path"""
 714         parts = urlparse.urlsplit(uri)
 715         if parts[1]:
 716             # URI
 717             return parts[1], parts[2] or '/'
 718         elif parts[0]:
 719             # host:port
 720             return uri, '/'
 721         else:
 722             # host
 723             return parts[2], '/'
 724
 725     def is_suburi(self, base, test):
 726         """Check if test is below base in a URI tree
 727
 728         Both args must be URIs in reduced form.
 729         """
 730         if base == test:
 731             return True
 732         if base[0] != test[0]:
 733             return False
 734         common = posixpath.commonprefix((base[1], test[1]))
 735         if len(common) == len(base[1]):
 736             return True
 737         return False
 738
 739
 740 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
 741
 742     def find_user_password(self, realm, authuri):
 743         user, password = HTTPPasswordMgr.find_user_password(self, realm,
 744                                                             authuri)
 745         if user is not None:
 746             return user, password
 747         return HTTPPasswordMgr.find_user_password(self, None, authuri)
 748
 749
 750 class AbstractBasicAuthHandler:
 751
 752     rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
 753
 754     # XXX there can actually be multiple auth-schemes in a
 755     # www-authenticate header.  should probably be a lot more careful
 756     # in parsing them to extract multiple alternatives
 757
 758     def __init__(self, password_mgr=None):
 759         if password_mgr is None:
 760             password_mgr = HTTPPasswordMgr()
 761         self.passwd = password_mgr
 762         self.add_password = self.passwd.add_password
 763
 764     def http_error_auth_reqed(self, authreq, host, req, headers):
 765         # host may be an authority (without userinfo) or a URL with an
 766         # authority
 767         # XXX could be multiple headers
 768         authreq = headers.get(authreq, None)
 769         if authreq:
 770             mo = AbstractBasicAuthHandler.rx.search(authreq)
 771             if mo:
 772                 scheme, realm = mo.groups()
 773                 if scheme.lower() == 'basic':
 774                     return self.retry_http_basic_auth(host, req, realm)
 775
 776     def retry_http_basic_auth(self, host, req, realm):
 777         user, pw = self.passwd.find_user_password(realm, host)
 778         if pw is not None:
 779             raw = "%s:%s" % (user, pw)
 780             auth = 'Basic %s' % base64.encodestring(raw).strip()
 781             if req.headers.get(self.auth_header, None) == auth:
 782                 return None
 783             req.add_header(self.auth_header, auth)
 784             return self.parent.open(req)
 785         else:
 786             return None
 787
 788
 789 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 790
 791     auth_header = 'Authorization'
 792
 793     def http_error_401(self, req, fp, code, msg, headers):
 794         url = req.get_full_url()
 795         return self.http_error_auth_reqed('www-authenticate',
 796                                           url, req, headers)
 797
 798
 799 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 800
 801     auth_header = 'Proxy-authorization'
 802
 803     def http_error_407(self, req, fp, code, msg, headers):
 804         # http_error_auth_reqed requires that there is no userinfo component in
 805         # authority.  Assume there isn't one, since urllib2 does not (and
 806         # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
 807         # userinfo.
 808         authority = req.get_host()
 809         return self.http_error_auth_reqed('proxy-authenticate',
 810                                           authority, req, headers)
 811
 812
 813 def randombytes(n):
 814     """Return n random bytes."""
 815     # Use /dev/urandom if it is available.  Fall back to random module
 816     # if not.  It might be worthwhile to extend this function to use
 817     # other platform-specific mechanisms for getting random bytes.
 818     if os.path.exists("/dev/urandom"):
 819         f = open("/dev/urandom")
 820         s = f.read(n)
 821         f.close()
 822         return s
 823     else:
 824         L = [chr(random.randrange(0, 256)) for i in range(n)]
 825         return "".join(L)
 826
 827 class AbstractDigestAuthHandler:
 828     # Digest authentication is specified in RFC 2617.
 829
 830     # XXX The client does not inspect the Authentication-Info header
 831     # in a successful response.
 832
 833     # XXX It should be possible to test this implementation against
 834     # a mock server that just generates a static set of challenges.
 835
 836     # XXX qop="auth-int" supports is shaky
 837
 838     def __init__(self, passwd=None):
 839         if passwd is None:
 840             passwd = HTTPPasswordMgr()
 841         self.passwd = passwd
 842         self.add_password = self.passwd.add_password
 843         self.retried = 0
 844         self.nonce_count = 0
 845
 846     def reset_retry_count(self):
 847         self.retried = 0
 848
 849     def http_error_auth_reqed(self, auth_header, host, req, headers):
 850         authreq = headers.get(auth_header, None)
 851         if self.retried > 5:
 852             # Don't fail endlessly - if we failed once, we'll probably
 853             # fail a second time. Hm. Unless the Password Manager is
 854             # prompting for the information. Crap. This isn't great
 855             # but it's better than the current 'repeat until recursion
 856             # depth exceeded' approach <wink>
 857             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
 858                             headers, None)
 859         else:
 860             self.retried += 1
 861         if authreq:
 862             scheme = authreq.split()[0]
 863             if scheme.lower() == 'digest':
 864                 return self.retry_http_digest_auth(req, authreq)
 865
 866     def retry_http_digest_auth(self, req, auth):
 867         token, challenge = auth.split(' ', 1)
 868         chal = parse_keqv_list(parse_http_list(challenge))
 869         auth = self.get_authorization(req, chal)
 870         if auth:
 871             auth_val = 'Digest %s' % auth
 872             if req.headers.get(self.auth_header, None) == auth_val:
 873                 return None
 874             req.add_unredirected_header(self.auth_header, auth_val)
 875             resp = self.parent.open(req)
 876             return resp
 877
 878     def get_cnonce(self, nonce):
 879         # The cnonce-value is an opaque
 880         # quoted string value provided by the client and used by both client
 881         # and server to avoid chosen plaintext attacks, to provide mutual
 882         # authentication, and to provide some message integrity protection.
 883         # This isn't a fabulous effort, but it's probably Good Enough.
 884         dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
 885                                             randombytes(8))).hexdigest()
 886         return dig[:16]
 887
 888     def get_authorization(self, req, chal):
 889         try:
 890             realm = chal['realm']
 891             nonce = chal['nonce']
 892             qop = chal.get('qop')
 893             algorithm = chal.get('algorithm', 'MD5')
 894             # mod_digest doesn't send an opaque, even though it isn't
 895             # supposed to be optional
 896             opaque = chal.get('opaque', None)
 897         except KeyError:
 898             return None
 899
 900         H, KD = self.get_algorithm_impls(algorithm)
 901         if H is None:
 902             return None
 903
 904         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
 905         if user is None:
 906             return None
 907
 908         # XXX not implemented yet
 909         if req.has_data():
 910             entdig = self.get_entity_digest(req.get_data(), chal)
 911         else:
 912             entdig = None
 913
 914         A1 = "%s:%s:%s" % (user, realm, pw)
 915         A2 = "%s:%s" % (req.get_method(),
 916                         # XXX selector: what about proxies and full urls
 917                         req.get_selector())
 918         if qop == 'auth':
 919             self.nonce_count += 1
 920             ncvalue = '%08x' % self.nonce_count
 921             cnonce = self.get_cnonce(nonce)
 922             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
 923             respdig = KD(H(A1), noncebit)
 924         elif qop is None:
 925             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
 926         else:
 927             # XXX handle auth-int.
 928             pass
 929
 930         # XXX should the partial digests be encoded too?
 931
 932         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
 933                'response="%s"' % (user, realm, nonce, req.get_selector(),
 934                                   respdig)
 935         if opaque:
 936             base += ', opaque="%s"' % opaque
 937         if entdig:
 938             base += ', digest="%s"' % entdig
 939         base += ', algorithm="%s"' % algorithm
 940         if qop:
 941             base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
 942         return base
 943
 944     def get_algorithm_impls(self, algorithm):
 945         # lambdas assume digest modules are imported at the top level
 946         if algorithm == 'MD5':
 947             H = lambda x: hashlib.md5(x).hexdigest()
 948         elif algorithm == 'SHA':
 949             H = lambda x: hashlib.sha1(x).hexdigest()
 950         # XXX MD5-sess
 951         KD = lambda s, d: H("%s:%s" % (s, d))
 952         return H, KD
 953
 954     def get_entity_digest(self, data, chal):
 955         # XXX not implemented yet
 956         return None
 957
 958
 959 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
 960     """An authentication protocol defined by RFC 2069
 961
 962     Digest authentication improves on basic authentication because it
 963     does not transmit passwords in the clear.
 964     """
 965
 966     auth_header = 'Authorization'
 967
 968     def http_error_401(self, req, fp, code, msg, headers):
 969         host = urlparse.urlparse(req.get_full_url())[1]
 970         retry = self.http_error_auth_reqed('www-authenticate',
 971                                            host, req, headers)
 972         self.reset_retry_count()
 973         return retry
 974
 975
 976 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
 977
 978     auth_header = 'Proxy-Authorization'
 979
 980     def http_error_407(self, req, fp, code, msg, headers):
 981         host = req.get_host()
 982         retry = self.http_error_auth_reqed('proxy-authenticate',
 983                                            host, req, headers)
 984         self.reset_retry_count()
 985         return retry
 986
 987 class AbstractHTTPHandler(BaseHandler):
 988
 989     def __init__(self, debuglevel=0):
 990         self._debuglevel = debuglevel
 991
 992     def set_http_debuglevel(self, level):
 993         self._debuglevel = level
 994
 995     def do_request_(self, request):
 996         host = request.get_host()
 997         if not host:
 998             raise URLError('no host given')
 999
1000         if request.has_data():  # POST
1001             data = request.get_data()
1002             if not request.has_header('Content-type'):
1003                 request.add_unredirected_header(
1004                     'Content-type',
1005                     'application/x-www-form-urlencoded')
1006             if not request.has_header('Content-length'):
1007                 request.add_unredirected_header(
1008                     'Content-length', '%d' % len(data))
1009
1010         scheme, sel = splittype(request.get_selector())
1011         sel_host, sel_path = splithost(sel)
1012         if not request.has_header('Host'):
1013             request.add_unredirected_header('Host', sel_host or host)
1014         for name, value in self.parent.addheaders:
1015             name = name.capitalize()
1016             if not request.has_header(name):
1017                 request.add_unredirected_header(name, value)
1018
1019         return request
1020
1021     def do_open(self, http_class, req):
1022         """Return an addinfourl object for the request, using http_class.
1023
1024         http_class must implement the HTTPConnection API from httplib.
1025         The addinfourl return value is a file-like object.  It also
1026         has methods and attributes including:
1027             - info(): return a mimetools.Message object for the headers
1028             - geturl(): return the original request URL
1029             - code: HTTP status code
1030         """
1031         host = req.get_host()
1032         if not host:
1033             raise URLError('no host given')
1034
1035         h = http_class(host) # will parse host:port
1036         h.set_debuglevel(self._debuglevel)
1037
1038         headers = dict(req.headers)
1039         headers.update(req.unredirected_hdrs)
1040         # We want to make an HTTP/1.1 request, but the addinfourl
1041         # class isn't prepared to deal with a persistent connection.
1042         # It will try to read all remaining data from the socket,
1043         # which will block while the server waits for the next request.
1044         # So make sure the connection gets closed after the (only)
1045         # request.
1046         headers["Connection"] = "close"
1047         try:
1048             h.request(req.get_method(), req.get_selector(), req.data, headers)
1049             r = h.getresponse()
1050         except socket.error, err: # XXX what error?
1051             raise URLError(err)
1052
1053         # Pick apart the HTTPResponse object to get the addinfourl
1054         # object initialized properly.
1055
1056         # Wrap the HTTPResponse object in socket's file object adapter
1057         # for Windows.  That adapter calls recv(), so delegate recv()
1058         # to read().  This weird wrapping allows the returned object to
1059         # have readline() and readlines() methods.
1060
1061         # XXX It might be better to extract the read buffering code
1062         # out of socket._fileobject() and into a base class.
1063
1064         r.recv = r.read
1065         fp = socket._fileobject(r)
1066
1067         resp = addinfourl(fp, r.msg, req.get_full_url())
1068         resp.code = r.status
1069         resp.msg = r.reason
1070         return resp
1071
1072
1073 class HTTPHandler(AbstractHTTPHandler):
1074
1075     def http_open(self, req):
1076         return self.do_open(httplib.HTTPConnection, req)
1077
1078     http_request = AbstractHTTPHandler.do_request_
1079
1080 if hasattr(httplib, 'HTTPS'):
1081     class HTTPSHandler(AbstractHTTPHandler):
1082
1083         def https_open(self, req):
1084             return self.do_open(httplib.HTTPSConnection, req)
1085
1086         https_request = AbstractHTTPHandler.do_request_
1087
1088 class HTTPCookieProcessor(BaseHandler):
1089     def __init__(self, cookiejar=None):
1090         import cookielib
1091         if cookiejar is None:
1092             cookiejar = cookielib.CookieJar()
1093         self.cookiejar = cookiejar
1094
1095     def http_request(self, request):
1096         self.cookiejar.add_cookie_header(request)
1097         return request
1098
1099     def http_response(self, request, response):
1100         self.cookiejar.extract_cookies(response, request)
1101         return response
1102
1103     https_request = http_request
1104     https_response = http_response
1105
1106 class UnknownHandler(BaseHandler):
1107     def unknown_open(self, req):
1108         type = req.get_type()
1109         raise URLError('unknown url type: %s' % type)
1110
1111 def parse_keqv_list(l):
1112     """Parse list of key=value strings where keys are not duplicated."""
1113     parsed = {}
1114     for elt in l:
1115         k, v = elt.split('=', 1)
1116         if v[0] == '"' and v[-1] == '"':
1117             v = v[1:-1]
1118         parsed[k] = v
1119     return parsed
1120
1121 def parse_http_list(s):
1122     """Parse lists as described by RFC 2068 Section 2.
1123
1124     In particular, parse comma-separated lists where the elements of
1125     the list may include quoted-strings.  A quoted-string could
1126     contain a comma.  A non-quoted string could have quotes in the
1127     middle.  Neither commas nor quotes count if they are escaped.
1128     Only double-quotes count, not single-quotes.
1129     """
1130     res = []
1131     part = ''
1132
1133     escape = quote = False
1134     for cur in s:
1135         if escape:
1136             part += cur
1137             escape = False
1138             continue
1139         if quote:
1140             if cur == '\\':
1141                 escape = True
1142                 continue
1143             elif cur == '"':
1144                 quote = False
1145             part += cur
1146             continue
1147
1148         if cur == ',':
1149             res.append(part)
1150             part = ''
1151             continue
1152
1153         if cur == '"':
1154             quote = True
1155
1156         part += cur
1157
1158     # append last part
1159     if part:
1160         res.append(part)
1161
1162     return [part.strip() for part in res]
1163
1164 class FileHandler(BaseHandler):
1165     # Use local file or FTP depending on form of URL
1166     def file_open(self, req):
1167         url = req.get_selector()
1168         if url[:2] == '//' and url[2:3] != '/':
1169             req.type = 'ftp'
1170             return self.parent.open(req)
1171         else:
1172             return self.open_local_file(req)
1173
1174     # names for the localhost
1175     names = None
1176     def get_names(self):
1177         if FileHandler.names is None:
1178             try:
1179                 FileHandler.names = (socket.gethostbyname('localhost'),
1180                                     socket.gethostbyname(socket.gethostname()))
1181             except socket.gaierror:
1182                 FileHandler.names = (socket.gethostbyname('localhost'),)
1183         return FileHandler.names
1184
1185     # not entirely sure what the rules are here
1186     def open_local_file(self, req):
1187         import email.Utils
1188         import mimetypes
1189         host = req.get_host()
1190         file = req.get_selector()
1191         localfile = url2pathname(file)
1192         stats = os.stat(localfile)
1193         size = stats.st_size
1194         modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
1195         mtype = mimetypes.guess_type(file)[0]
1196         headers = mimetools.Message(StringIO(
1197             'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
1198             (mtype or 'text/plain', size, modified)))
1199         if host:
1200             host, port = splitport(host)
1201         if not host or \
1202            (not port and socket.gethostbyname(host) in self.get_names()):
1203             return addinfourl(open(localfile, 'rb'),
1204                               headers, 'file:'+file)
1205         raise URLError('file not on local host')
1206
1207 class FTPHandler(BaseHandler):
1208     def ftp_open(self, req):
1209         import ftplib
1210         import mimetypes
1211         host = req.get_host()
1212         if not host:
1213             raise IOError, ('ftp error', 'no host given')
1214         host, port = splitport(host)
1215         if port is None:
1216             port = ftplib.FTP_PORT
1217         else:
1218             port = int(port)
1219
1220         # username/password handling
1221         user, host = splituser(host)
1222         if user:
1223             user, passwd = splitpasswd(user)
1224         else:
1225             passwd = None
1226         host = unquote(host)
1227         user = unquote(user or '')
1228         passwd = unquote(passwd or '')
1229
1230         try:
1231             host = socket.gethostbyname(host)
1232         except socket.error, msg:
1233             raise URLError(msg)
1234         path, attrs = splitattr(req.get_selector())
1235         dirs = path.split('/')
1236         dirs = map(unquote, dirs)
1237         dirs, file = dirs[:-1], dirs[-1]
1238         if dirs and not dirs[0]:
1239             dirs = dirs[1:]
1240         try:
1241             fw = self.connect_ftp(user, passwd, host, port, dirs)
1242             type = file and 'I' or 'D'
1243             for attr in attrs:
1244                 attr, value = splitvalue(attr)
1245                 if attr.lower() == 'type' and \
1246                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
1247                     type = value.upper()
1248             fp, retrlen = fw.retrfile(file, type)
1249             headers = ""
1250             mtype = mimetypes.guess_type(req.get_full_url())[0]
1251             if mtype:
1252                 headers += "Content-type: %s\n" % mtype
1253             if retrlen is not None and retrlen >= 0:
1254                 headers += "Content-length: %d\n" % retrlen
1255             sf = StringIO(headers)
1256             headers = mimetools.Message(sf)
1257             return addinfourl(fp, headers, req.get_full_url())
1258         except ftplib.all_errors, msg:
1259             raise IOError, ('ftp error', msg), sys.exc_info()[2]
1260
1261     def connect_ftp(self, user, passwd, host, port, dirs):
1262         fw = ftpwrapper(user, passwd, host, port, dirs)
1263 ##        fw.ftp.set_debuglevel(1)
1264         return fw
1265
1266 class CacheFTPHandler(FTPHandler):
1267     # XXX would be nice to have pluggable cache strategies
1268     # XXX this stuff is definitely not thread safe
1269     def __init__(self):
1270         self.cache = {}
1271         self.timeout = {}
1272         self.soonest = 0
1273         self.delay = 60
1274         self.max_conns = 16
1275
1276     def setTimeout(self, t):
1277         self.delay = t
1278
1279     def setMaxConns(self, m):
1280         self.max_conns = m
1281
1282     def connect_ftp(self, user, passwd, host, port, dirs):
1283         key = user, host, port, '/'.join(dirs)
1284         if key in self.cache:
1285             self.timeout[key] = time.time() + self.delay
1286         else:
1287             self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
1288             self.timeout[key] = time.time() + self.delay
1289         self.check_cache()
1290         return self.cache[key]
1291
1292     def check_cache(self):
1293         # first check for old ones
1294         t = time.time()
1295         if self.soonest <= t:
1296             for k, v in self.timeout.items():
1297                 if v < t:
1298                     self.cache[k].close()
1299                     del self.cache[k]
1300                     del self.timeout[k]
1301         self.soonest = min(self.timeout.values())
1302
1303         # then check the size
1304         if len(self.cache) == self.max_conns:
1305             for k, v in self.timeout.items():
1306                 if v == self.soonest:
1307                     del self.cache[k]
1308                     del self.timeout[k]
1309                     break
1310             self.soonest = min(self.timeout.values())
1311
1312 class GopherHandler(BaseHandler):
1313     def gopher_open(self, req):
1314         # XXX can raise socket.error
1315         import gopherlib  # this raises DeprecationWarning in 2.5
1316         host = req.get_host()
1317         if not host:
1318             raise GopherError('no host given')
1319         host = unquote(host)
1320         selector = req.get_selector()
1321         type, selector = splitgophertype(selector)
1322         selector, query = splitquery(selector)
1323         selector = unquote(selector)
1324         if query:
1325             query = unquote(query)
1326             fp = gopherlib.send_query(selector, query, host)
1327         else:
1328             fp = gopherlib.send_selector(selector, host)
1329         return addinfourl(fp, noheaders(), req.get_full_url())