Saved and restored logging._handlerList at the same time as saving/restoring logging...
[python.git] / Lib / urllib2.py
blobba0d5b57ad77be02c0cead4f3cd3952cbf5ee831
1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15 deals with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as the original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. if one of the arguments is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
33 OpenerDirector --
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. it can also include extra HTTP
37 headers, e.g. a User-Agent.
39 BaseHandler --
41 exceptions:
42 URLError-- a subclass of IOError, individual protocols have their own
43 specific subclass
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
65 # install it
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
71 """
73 # XXX issues:
74 # If an authentication error handler that tries to perform
75 # authentication for some reason but fails, how should the error be
76 # signalled? The client needs to know the HTTP error code. But if
77 the handler knows what the problem was, e.g., that it didn't know
78 the hash algorithm requested in the challenge, it would be good to
79 # pass that information along to the client, too.
81 # XXX to do:
82 # name!
83 # documentation (getting there)
84 # complex proxies
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
90 import base64
91 import ftplib
92 import gopherlib
93 import httplib
94 import inspect
95 import md5
96 import mimetypes
97 import mimetools
98 import os
99 import posixpath
100 import random
101 import re
102 import sha
103 import socket
104 import sys
105 import time
106 import urlparse
107 import bisect
108 import cookielib
110 try:
111 from cStringIO import StringIO
112 except ImportError:
113 from StringIO import StringIO
115 # not sure how many of these need to be gotten rid of
116 from urllib import (unwrap, unquote, splittype, splithost,
117 addinfourl, splitport, splitgophertype, splitquery,
118 splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
120 # support for FileHandler, proxies via environment variables
121 from urllib import localhost, url2pathname, getproxies
123 __version__ = "2.4"
_opener = None

def urlopen(url, data=None):
    """Open *url* (a string or a Request) with the global default opener.

    The opener is built lazily on first use via build_opener().  When
    *data* is given, an HTTP request becomes a POST.  Returns a
    file-like response object.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
def install_opener(opener):
    """Install *opener* as the process-wide default used by urlopen()."""
    global _opener
    _opener = opener
136 # do these error classes make sense?
137 # make sure all of the IOError stuff is overridden. we just want to be
138 # subtypes.
class URLError(IOError):
    """Base error raised by handlers for failed opens.

    Subclasses IOError for compatibility but shares none of its
    implementation.  ``args`` is populated so EnvironmentError-style
    unpacking keeps working, although slot 0 holds the reason rather
    than an errno.
    """

    def __init__(self, reason):
        self.reason = reason
        self.args = (reason,)

    def __str__(self):
        return '<urlopen error %s>' % self.reason
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    # Private alias bound at class-creation time; the name mangling
    # keeps subclasses from accidentally overriding it.
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code        # numeric HTTP status
        self.msg = msg          # reason phrase
        self.hdrs = hdrs        # response headers (mimetools.Message)
        self.fp = fp            # response body file object, may be None
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)
class GopherError(URLError):
    # Raised by the gopher handler; carries no state beyond URLError.
    pass
class Request:
    """Encapsulates one request: URL, optional POST data and headers.

    The type/host/selector components of the URL are parsed lazily
    (see __getattr__ and the get_* methods) so a Request can be built
    cheaply and redirected to a proxy later via set_proxy().
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        # Normalize header capitalization through add_header().
        for key, value in headers.items():
            self.add_header(key, value)
        self.unredirected_hdrs = {}
        # Cookie handling (RFC 2965) needs the origin host plus the
        # "unverifiable" flag; default the origin to this request.
        if origin_req_host is None:
            origin_req_host = cookielib.request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order. this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                # Run the lazy parser, then retry the attribute lookup.
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError, attr

    def get_method(self):
        # POST iff data is present; only GET/POST are modelled here.
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        # Lazily split the scheme off the URL; cached in self.type.
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError, "unknown url type: %s" % self.__original
        return self.type

    def get_host(self):
        # Lazily split the netloc out of the scheme-less remainder.
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        # Path (plus query) sent on the request line; after
        # set_proxy() this is the full original URL instead.
        return self.__r_host

    def set_proxy(self, host, type):
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        # Regular headers take precedence over unredirected ones.
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()
class OpenerDirector:
    """Composite that chains BaseHandler instances to open URLs.

    Handlers register via add_handler(); their method names (e.g.
    "http_open", "http_error_302", "http_request", "http_response")
    determine which dispatch chains they join.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}       # kind -> [handlers with <proto>_open]
        self.handle_error = {}      # proto -> {code -> [handlers]}
        self.process_response = {}  # proto -> [response processors]
        self.process_request = {}   # proto -> [request processors]

    def add_handler(self, handler):
        # Inspect the handler's method names to decide which chains it
        # participates in: <proto>_open, <proto>_error_<code>,
        # <proto>_request, <proto>_response.
        added = False
        for meth in dir(handler):
            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)  # e.g. http_error_404 -> 404
                except ValueError:
                    pass              # e.g. http_error_default
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each chain sorted by handler_order (BaseHandler.__lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # XXX why does self.handlers need to be sorted?
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Call each handler in chain[kind] until one returns non-None.
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # default_open handlers get first shot, then the
        # protocol-specific chain, then unknown_open as fallback.
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!  (the numeric status code)
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # Nothing claimed the specific code: fall back to the
            # catch-all http_error_default chain.
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
407 # XXX probably also want an abstract factory that knows when it makes
408 # sense to skip a superclass in favor of a subclass and when it might
409 # make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    # BUG FIX: collect the defaults to drop in a *set*.  With a list,
    # two supplied handlers overriding the same default appended the
    # class twice, and the second default_classes.remove() raised
    # ValueError.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if inspect.isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()   # instantiate classes passed in place of instances
        opener.add_handler(h)
    return opener
class BaseHandler:
    """Common base for the protocol handlers an OpenerDirector manages."""

    # Sort key used when the director orders its handler chains.
    handler_order = 500

    def add_parent(self, parent):
        # Back-reference to the director so handlers can re-enter open().
        self.parent = parent

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def __lt__(self, other):
        try:
            other_order = other.handler_order
        except AttributeError:
            # Preserve the old behavior of sorting custom classes that
            # are unaware of handler_order after the default ones.
            return True
        return self.handler_order < other_order
class HTTPErrorProcessor(BaseHandler):
    """Route non-2xx HTTP responses into the opener's error chain."""

    # Run after every other response processor.
    handler_order = 1000

    def http_response(self, request, response):
        code = response.code
        msg = response.msg
        hdrs = response.info()

        if code != 200 and code != 206:
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
class HTTPDefaultErrorHandler(BaseHandler):
    # Last resort: turn any HTTP error status nobody else handled
    # into an HTTPError exception.
    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Follow 301/302/303/307 responses, with redirect-loop detection."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # The new request is flagged unverifiable so cookie
            # processing can apply the RFC 2965 rules.
            return Request(newurl,
                           headers=req.headers,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        # Location may be relative; resolve against the request URL.
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
class ProxyHandler(BaseHandler):
    """Rewrite requests so they are sent through the configured proxies."""

    # Proxies must be in front of the other handlers.
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()  # environment / platform settings
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        # Synthesize one <scheme>_open method per configured scheme.
        # The lambda's default arguments bind the per-scheme values at
        # definition time (a plain closure would see only the last
        # iteration's values).
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        orig_type = req.get_type()
        type, r_type = splittype(proxy)
        if not type or r_type.isdigit():
            # proxy is specified without protocol
            type = orig_type
            host = proxy
        else:
            host, r_host = splithost(r_type)
        # NOTE(review): splituser/splitpasswd run unconditionally --
        # verify they tolerate proxy URLs without embedded credentials.
        user_pass, host = splituser(host)
        user, password = splitpasswd(user_pass)
        if user and password:
            # NOTE(review): this re-split looks redundant with the
            # splitpasswd() result above -- confirm intent.
            user, password = user_pass.split(':', 1)
            user_pass = base64.encodestring('%s:%s' % (unquote(user),
                                            unquote(password))).strip()
            req.add_header('Proxy-authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
607 # feature suggested by Duncan Booth
608 # XXX custom is not a good name
class CustomProxy:
    """One proxy rule: either pass a predicate function to the
    constructor or subclass and override handle()."""

    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        # 1 when this proxy should serve *req*, otherwise None.
        predicate = self.func
        if predicate and predicate(req):
            return 1

    def get_proxy(self):
        return self.addr
class CustomProxyHandler(BaseHandler):
    """Route requests through CustomProxy objects, grouped by scheme."""

    # Proxies must be in front of the other handlers.
    handler_order = 100

    def __init__(self, *proxies):
        self.proxies = {}
        # BUG FIX: the constructor used to ignore its arguments, so
        # CustomProxyHandler(p1, p2) silently produced an empty handler.
        for proxy in proxies:
            self.add_proxy(proxy)

    def proxy_open(self, req):
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                # BUG FIX: Request.set_proxy() requires (host, type);
                # the single-argument call raised TypeError as soon as
                # a proxy matched.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        # XXX unused hook, kept for backwards compatibility.
        return self.parent.open(req)

    def add_proxy(self, cpo):
        # Register a CustomProxy under its protocol.
        if cpo.proto in self.proxies:
            self.proxies[cpo.proto].append(cpo)
        else:
            self.proxies[cpo.proto] = [cpo]
class HTTPPasswordMgr:
    """Map (realm, URI-prefix) pairs to (user, password) credentials."""

    def __init__(self):
        # realm -> {tuple-of-reduced-uris -> (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        uri = tuple(map(self.reduce_uri, uri))
        if not realm in self.passwd:
            self.passwd[realm] = {}
        self.passwd[realm][uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        # Return the credentials whose registered URI prefix covers
        # authuri, or (None, None) when nothing matches.
        domains = self.passwd.get(realm, {})
        authuri = self.reduce_uri(authuri)
        for uris, authinfo in domains.iteritems():
            for uri in uris:
                if self.is_suburi(uri, authuri):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        if parts[1]:
            # full URI: (netloc, path-or-'/')
            return parts[1], parts[2] or '/'
        else:
            # bare host[:port]: urlparse leaves it in the path slot
            return parts[2], '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # Same host: test matches when its path starts with base's path.
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the None (catch-all) realm."""

    def find_user_password(self, realm, authuri):
        # Exact realm first, then the wildcard realm None.
        for candidate in (realm, None):
            user, password = HTTPPasswordMgr.find_user_password(
                self, candidate, authuri)
            if user is not None:
                return user, password
        return user, password
class AbstractBasicAuthHandler:
    """Shared machinery for Basic authentication (RFC 2617).

    Subclasses define ``auth_header`` and wire an http_error_401/407
    hook to http_error_auth_reqed().
    """

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives
    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        match = AbstractBasicAuthHandler.rx.search(challenge)
        if match is None:
            return None
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        # TODO(jhylton): Remove the host argument?  It depends on whether
        # retry_http_basic_auth() is considered part of the public API.
        # It probably is.
        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.encodestring(raw).strip()
        if req.headers.get(self.auth_header, None) == auth:
            # Same credentials were already rejected; stop retrying.
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    # Basic auth against the origin server (401 responses).
    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The netloc of the request URL identifies the protection space.
        host = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          host, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    # Basic auth against an intermediate proxy (407 responses).
    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          host, req, headers)
def randombytes(n):
    """Return n random bytes.

    Uses /dev/urandom when available; otherwise falls back to the
    (weaker) random module.  It might be worthwhile to extend this
    function to use other platform-specific mechanisms for getting
    random bytes.
    """
    if os.path.exists("/dev/urandom"):
        f = open("/dev/urandom")
        try:
            # BUG FIX: close the file even if read() raises.
            return f.read(n)
        finally:
            f.close()
    else:
        L = [chr(random.randrange(0, 256)) for i in range(n)]
        return "".join(L)
class AbstractDigestAuthHandler:
    """Shared machinery for Digest authentication (RFC 2617).

    Subclasses supply ``auth_header`` and wire the 401/407 error hooks
    to http_error_auth_reqed().
    """
    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" support is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry the request with Digest credentials, or fail loudly."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            else:
                raise ValueError("AbstractDigestAuthHandler doesn't know "
                                 "about %s"%(scheme))

    def retry_http_digest_auth(self, req, auth):
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # Same credentials were already rejected; stop retrying.
                return None
            req.add_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
                                       randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization header value, or return None."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            # BUG FIX: this branch used to fall through with respdig
            # unbound, producing a NameError below; raise a meaningful
            # error instead.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: md5.new(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: sha.new(x).hexdigest()
        else:
            # BUG FIX: an unknown algorithm used to leave H unbound
            # (UnboundLocalError); the caller checks for H is None.
            H = None
        # XXX MD5-sess
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        host = urlparse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        # A fresh challenge on the next 401 starts the count over.
        self.reset_retry_count()
        return retry
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    # Digest auth against an intermediate proxy (407 responses).
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        host = req.get_host()
        retry = self.http_error_auth_reqed('proxy-authenticate',
                                           host, req, headers)
        # A fresh challenge on the next 407 starts the count over.
        self.reset_retry_count()
        return retry
class AbstractHTTPHandler(BaseHandler):
    """Shared request preparation and opening logic for HTTP and HTTPS."""

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        # Forwarded to httplib's connection object in do_open().
        self._debuglevel = level

    def do_request_(self, request):
        # Fill in default headers (Content-type/length for POSTs, Host,
        # and the opener-wide addheaders) before the request goes out.
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        scheme, sel = splittype(request.get_selector())
        sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            # For proxied requests the selector is the full URL, so
            # prefer the host embedded there over the proxy host.
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Unredirected headers win over normal ones.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err: # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
class HTTPHandler(AbstractHTTPHandler):
    """Handler for the http scheme, built on httplib.HTTPConnection."""

    # Requests are pre-processed by the shared do_request_ hook
    # inherited from AbstractHTTPHandler.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Open req over a plain HTTP connection."""
        return self.do_open(httplib.HTTPConnection, req)
# HTTPSHandler only exists when the httplib module was built with SSL
# support; otherwise the https scheme simply has no handler.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):

        def https_open(self, req):
            # identical to HTTPHandler.http_open but over an SSL socket
            return self.do_open(httplib.HTTPSConnection, req)

        https_request = AbstractHTTPHandler.do_request_
class HTTPCookieProcessor(BaseHandler):
    """Attach cookies to outgoing requests and harvest them from responses.

    Operates on the supplied cookielib.CookieJar; a fresh, empty jar is
    created when none is given.
    """

    def __init__(self, cookiejar=None):
        self.cookiejar = cookiejar
        if self.cookiejar is None:
            self.cookiejar = cookielib.CookieJar()

    def http_request(self, request):
        # Add any matching Cookie headers before the request goes out.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Record cookies the server set on this response.
        self.cookiejar.extract_cookies(response, request)
        return response

    # Cookie processing is identical over TLS.
    https_request = http_request
    https_response = http_response
class UnknownHandler(BaseHandler):
    """Catch-all handler: any scheme no other handler claims is an error."""

    def unknown_open(self, req):
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
def parse_keqv_list(l):
    """Parse a list of key=value strings where keys are not duplicated.

    Returns a dict mapping each key to its value.  A value enclosed in
    a pair of double quotes has the quotes stripped.  Later duplicates
    of a key silently overwrite earlier ones.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip surrounding quotes only when they form a real pair.
        # The length guard fixes an IndexError on empty values ("k=")
        # and avoids swallowing a lone '"' character.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.

    Only double-quotes count, not single-quotes.
    """
    items = []
    current = ''
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # previous char was a backslash inside quotes: take this
            # char literally (the backslash itself was dropped)
            current += ch
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                current += ch
        elif ch == ',':
            # unquoted comma ends the current element
            items.append(current)
            current = ''
        else:
            if ch == '"':
                in_quotes = True
            current += ch

    # flush the final element, if any
    if current:
        items.append(current)

    return [item.strip() for item in items]
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Dispatch a file: request: open true local paths directly;
        re-route file://host/path (non-empty host) through FTP."""
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return (caching on the class) the IP addresses that count as
        "this machine" for file: host checks."""
        if FileHandler.names is None:
            FileHandler.names = (socket.gethostbyname('localhost'),
                                 socket.gethostbyname(socket.gethostname()))
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Open a local file and return an addinfourl wrapping it, with
        synthesized Content-type, Content-length and Last-modified
        headers.

        Raises URLError if the file cannot be stat'ed or if the URL
        names a host other than the local machine.
        """
        import email.Utils
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        try:
            stats = os.stat(localfile)
        except OSError as e:
            # a missing/unreadable file is a URL-level error, not a
            # raw OSError escaping from urlopen()
            raise URLError(e)
        size = stats.st_size
        modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(file)[0]
        headers = mimetools.Message(StringIO(
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if host:
            host, port = splitport(host)
        # serve the file only when no host was given, or the host (with
        # no explicit port) resolves to one of this machine's addresses
        if not host or \
           (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:'+file)
        raise URLError('file not on local host')
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Open an ftp: URL: resolve the host, log in (anonymous unless
        user:password is embedded in the URL), and retrieve the named
        file or directory listing.

        Returns an addinfourl with synthesized Content-type and
        Content-length headers.  Raises URLError when the host fails to
        resolve, and IOError ('ftp error', ...) for a missing host or
        any ftplib failure.
        """
        host = req.get_host()
        if not host:
            raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        # split off ";attr=value" parts, then the path components
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        # a leading '/' produces an empty first component; drop it
        if dirs and not dirs[0]:
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs)
            # default transfer mode: binary ('I') when a filename is
            # present, directory listing ('D') otherwise
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                # an explicit ";type=a|i|d" URL attribute overrides it
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            # re-raise as IOError but keep the original traceback
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs):
        # Hook point: subclasses (e.g. CacheFTPHandler) override this
        # to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs)
        ## fw.ftp.set_debuglevel(1)
        return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler that caches live connections keyed by
    (user, host, port, path), expiring them after self.delay seconds
    and capping the cache at self.max_conns entries."""
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}      # key -> live ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached entries
        self.delay = 60      # seconds a connection stays cached
        self.max_conns = 16  # cap on cached connections

    def setTimeout(self, t):
        self.delay = t

    def setMaxConns(self, m):
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        key = user, host, port, '/'.join(dirs)
        if key in self.cache:
            # refresh the expiry of the reused connection
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # iterate over a copy: entries are deleted while scanning
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # guard: min() raises ValueError on an empty sequence
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            # the eviction above may have emptied the cache entirely
            # (e.g. max_conns == 1), where min() would blow up
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0
class GopherHandler(BaseHandler):
    """Handler for gopher: URLs, built on the gopherlib module."""

    def gopher_open(self, req):
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        # split off the gopher item type, then any search query
        gtype, rest = splitgophertype(req.get_selector())
        selector, query = splitquery(rest)
        selector = unquote(selector)
        if query:
            fp = gopherlib.send_query(selector, unquote(query), host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), req.get_full_url())
#bleck! don't use this yet
class OpenerFactory:
    # Unfinished experiment in assembling OpenerDirectors from handler
    # lists; the supported public API is the module-level build_opener().

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    # NOTE(review): class-level mutable lists are shared by all
    # instances until add_handler() rebinds self.handlers.
    handlers = []
    replacement_handlers = []

    def add_handler(self, h):
        # rebinding (rather than .append) gives this instance its own list
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        # not implemented yet
        pass

    def build_opener(self):
        """Build and return an OpenerDirector populated with the
        default handlers, instantiating any that are classes."""
        opener = OpenerDirector()
        for ph in self.default_handlers:
            if inspect.isclass(ph):
                ph = ph()
            opener.add_handler(ph)
        # bug fix: the constructed opener was built and then discarded
        return opener
1299 # Mapping status codes to official W3C names
1300 httpresponses = {
1301 100: 'Continue',
1302 101: 'Switching Protocols',
1304 200: 'OK',
1305 201: 'Created',
1306 202: 'Accepted',
1307 203: 'Non-Authoritative Information',
1308 204: 'No Content',
1309 205: 'Reset Content',
1310 206: 'Partial Content',
1312 300: 'Multiple Choices',
1313 301: 'Moved Permanently',
1314 302: 'Found',
1315 303: 'See Other',
1316 304: 'Not Modified',
1317 305: 'Use Proxy',
1318 306: '(Unused)',
1319 307: 'Temporary Redirect',
1321 400: 'Bad Request',
1322 401: 'Unauthorized',
1323 402: 'Payment Required',
1324 403: 'Forbidden',
1325 404: 'Not Found',
1326 405: 'Method Not Allowed',
1327 406: 'Not Acceptable',
1328 407: 'Proxy Authentication Required',
1329 408: 'Request Timeout',
1330 409: 'Conflict',
1331 410: 'Gone',
1332 411: 'Length Required',
1333 412: 'Precondition Failed',
1334 413: 'Request Entity Too Large',
1335 414: 'Request-URI Too Long',
1336 415: 'Unsupported Media Type',
1337 416: 'Requested Range Not Satisfiable',
1338 417: 'Expectation Failed',
1340 500: 'Internal Server Error',
1341 501: 'Not Implemented',
1342 502: 'Bad Gateway',
1343 503: 'Service Unavailable',
1344 504: 'Gateway Timeout',
1345 505: 'HTTP Version Not Supported',