Lib/urllib2.py

   1 """An extensible library for opening URLs using a variety of protocols
   2
   3 The simplest way to use this module is to call the urlopen function,
   4 which accepts a string containing a URL or a Request object (described
   5 below).  It opens the URL and returns the results as a file-like
   6 object; the returned object has some extra methods described below.
   7
   8 The OpenerDirector manages a collection of Handler objects that do
   9 all the actual work.  Each Handler implements a particular protocol or
  10 option.  The OpenerDirector is a composite object that invokes the
  11 Handlers needed to open the requested URL.  For example, the
  12 HTTPHandler performs HTTP GET and POST requests and deals with
  13 non-error returns.  The HTTPRedirectHandler automatically deals with
  14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
  15 deals with digest authentication.
  16
  17 urlopen(url, data=None) -- Basic usage is the same as original
  18 urllib.  pass the url and optionally data to post to an HTTP URL, and
  19 get a file-like object back.  One difference is that you can also pass
  20 a Request instance instead of URL.  Raises a URLError (subclass of
  21 IOError); for HTTP errors, raises an HTTPError, which can also be
  22 treated as a valid response.
  23
  24 build_opener -- Function that creates a new OpenerDirector instance.
  25 Will install the default handlers.  Accepts one or more Handlers as
  26 arguments, either instances or Handler classes that it will
  27 instantiate.  If one of the arguments is a subclass of the default
  28 handler, the argument will be installed instead of the default.
  29
  30 install_opener -- Installs a new opener as the default opener.
  31
  32 objects of interest:
  33 OpenerDirector --
  34
  35 Request -- An object that encapsulates the state of a request.  The
  36 state can be as simple as the URL.  It can also include extra HTTP
  37 headers, e.g. a User-Agent.
  38
  39 BaseHandler --
  40
  41 exceptions:
  42 URLError -- A subclass of IOError, individual protocols have their own
  43 specific subclass.
  44
  45 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
  46 as an exceptional event or valid response.
  47
  48 internals:
  49 BaseHandler and parent
  50 _call_chain conventions
  51
  52 Example usage:
  53
  54 import urllib2
  55
  56 # set up authentication info
  57 authinfo = urllib2.HTTPBasicAuthHandler()
  58 authinfo.add_password(realm='PDQ Application',
  59                       uri='https://mahler:8092/site-updates.py',
  60                       user='klem',
  61                       passwd='geheim$parole')
  62
  63 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
  64
  65 # build a new opener that adds authentication and caching FTP handlers
  66 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
  67
  68 # install it
  69 urllib2.install_opener(opener)
  70
  71 f = urllib2.urlopen('http://www.python.org/')
  72
  73
  74 """
  75
  76 # XXX issues:
  77 # If an authentication error handler tries to perform
  78 # authentication but fails for some reason, how should the error be
  79 # signalled?  The client needs to know the HTTP error code.  But if
  80 # the handler knows what the problem was, e.g., that it didn't know
  81 # the hash algo that was requested in the challenge, it would be good to
  82 # pass that information along to the client, too.
  83 # ftp errors aren't handled cleanly
  84 # check digest against correct (i.e. non-apache) implementation
  85
  86 # Possible extensions:
  87 # complex proxies  XXX not sure what exactly was meant by this
  88 # abstract factory for opener
  89
  90 import base64
  91 import hashlib
  92 import httplib
  93 import mimetools
  94 import os
  95 import posixpath
  96 import random
  97 import re
  98 import socket
  99 import sys
 100 import time
 101 import urlparse
 102 import bisect
 103
 104 try:
 105     from cStringIO import StringIO
 106 except ImportError:
 107     from StringIO import StringIO
 108
 109 from urllib import (unwrap, unquote, splittype, splithost, quote,
 110      addinfourl, splitport, splitquery,
 111      splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
 112
 113 # support for FileHandler, proxies via environment variables
 114 from urllib import localhost, url2pathname, getproxies
 115
 116 # used in User-Agent header sent
 117 __version__ = sys.version[:3]
 118
 119 _opener = None
 120 def urlopen(url, data=None, timeout=None):
 121     global _opener
 122     if _opener is None:
 123         _opener = build_opener()
 124     return _opener.open(url, data, timeout)
 125
 126 def install_opener(opener):
 127     global _opener
 128     _opener = opener
 129
 130 # do these error classes make sense?
 131 # make sure all of the IOError stuff is overridden.  we just want to be
 132 # subtypes.
 133
 134 class URLError(IOError):
 135     # URLError is a sub-type of IOError, but it doesn't share any of
 136     # the implementation.  need to override __init__ and __str__.
 137     # It sets self.args for compatibility with other EnvironmentError
 138     # subclasses, but args doesn't have the typical format with errno in
 139     # slot 0 and strerror in slot 1.  This may be better than nothing.
 140     def __init__(self, reason):
 141         self.args = reason,
 142         self.reason = reason
 143
 144     def __str__(self):
 145         return '<urlopen error %s>' % self.reason
 146
 147 class HTTPError(URLError, addinfourl):
 148     """Raised when HTTP error occurs, but also acts like non-error return"""
 149     __super_init = addinfourl.__init__
 150
 151     def __init__(self, url, code, msg, hdrs, fp):
 152         self.code = code
 153         self.msg = msg
 154         self.hdrs = hdrs
 155         self.fp = fp
 156         self.filename = url
 157         # The addinfourl classes depend on fp being a valid file
 158         # object.  In some cases, the HTTPError may not have a valid
 159         # file object.  If this happens, the simplest workaround is to
 160         # not initialize the base classes.
 161         if fp is not None:
 162             self.__super_init(fp, hdrs, url)
 163
 164     def __str__(self):
 165         return 'HTTP Error %s: %s' % (self.code, self.msg)
 166
 167 # copied from cookielib.py
 168 _cut_port_re = re.compile(r":\d+$")
 169 def request_host(request):
 170     """Return request-host, as defined by RFC 2965.
 171
 172     Variation from RFC: returned value is lowercased, for convenient
 173     comparison.
 174
 175     """
 176     url = request.get_full_url()
 177     host = urlparse.urlparse(url)[1]
 178     if host == "":
 179         host = request.get_header("Host", "")
 180
 181     # remove port, if present
 182     host = _cut_port_re.sub("", host, 1)
 183     return host.lower()
 184
 185 class Request:
 186
 187     def __init__(self, url, data=None, headers={},
 188                  origin_req_host=None, unverifiable=False):
 189         # unwrap('<URL:type://host/path>') --> 'type://host/path'
 190         self.__original = unwrap(url)
 191         self.type = None
 192         # self.__r_type is what's left after doing the splittype
 193         self.host = None
 194         self.port = None
 195         self.data = data
 196         self.headers = {}
 197         for key, value in headers.items():
 198             self.add_header(key, value)
 199         self.unredirected_hdrs = {}
 200         if origin_req_host is None:
 201             origin_req_host = request_host(self)
 202         self.origin_req_host = origin_req_host
 203         self.unverifiable = unverifiable
 204
 205     def __getattr__(self, attr):
 206         # XXX this is a fallback mechanism to guard against these
 207         # methods getting called in a non-standard order.  this may be
 208         # too complicated and/or unnecessary.
 209         # XXX should the __r_XXX attributes be public?
 210         if attr[:12] == '_Request__r_':
 211             name = attr[12:]
 212             if hasattr(Request, 'get_' + name):
 213                 getattr(self, 'get_' + name)()
 214                 return getattr(self, attr)
 215         raise AttributeError, attr
 216
 217     def get_method(self):
 218         if self.has_data():
 219             return "POST"
 220         else:
 221             return "GET"
 222
 223     # XXX these helper methods are lame
 224
 225     def add_data(self, data):
 226         self.data = data
 227
 228     def has_data(self):
 229         return self.data is not None
 230
 231     def get_data(self):
 232         return self.data
 233
 234     def get_full_url(self):
 235         return self.__original
 236
 237     def get_type(self):
 238         if self.type is None:
 239             self.type, self.__r_type = splittype(self.__original)
 240             if self.type is None:
 241                 raise ValueError, "unknown url type: %s" % self.__original
 242         return self.type
 243
 244     def get_host(self):
 245         if self.host is None:
 246             self.host, self.__r_host = splithost(self.__r_type)
 247             if self.host:
 248                 self.host = unquote(self.host)
 249         return self.host
 250
 251     def get_selector(self):
 252         return self.__r_host
 253
 254     def set_proxy(self, host, type):
 255         self.host, self.type = host, type
 256         self.__r_host = self.__original
 257
 258     def get_origin_req_host(self):
 259         return self.origin_req_host
 260
 261     def is_unverifiable(self):
 262         return self.unverifiable
 263
 264     def add_header(self, key, val):
 265         # useful for something like authentication
 266         self.headers[key.capitalize()] = val
 267
 268     def add_unredirected_header(self, key, val):
 269         # will not be added to a redirected request
 270         self.unredirected_hdrs[key.capitalize()] = val
 271
 272     def has_header(self, header_name):
 273         return (header_name in self.headers or
 274                 header_name in self.unredirected_hdrs)
 275
 276     def get_header(self, header_name, default=None):
 277         return self.headers.get(
 278             header_name,
 279             self.unredirected_hdrs.get(header_name, default))
 280
 281     def header_items(self):
 282         hdrs = self.unredirected_hdrs.copy()
 283         hdrs.update(self.headers)
 284         return hdrs.items()
 285
class OpenerDirector:
    """Dispatches requests to registered handler objects.

    Handlers are indexed by the names of their methods (see
    add_handler) into four lookup tables: handle_open, handle_error,
    process_request and process_response.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}
        self.process_response = {}
        self.process_request = {}

    def add_handler(self, handler):
        """Register handler under every table its method names match.

        Each attribute name is split at its first underscore into
        <protocol>_<condition>.  Conditions "open", "request" and
        "response" register the handler under the protocol name; a
        condition starting with "error" registers it in
        handle_error[protocol] under whatever follows the second
        underscore (converted to int when possible).  Raises TypeError
        if handler has no add_parent attribute.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # j is the index of the second underscore in meth, so
                # kind is e.g. "302" or "default" for http_error_302 /
                # http_error_default.
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            # Keep each handler list sorted (handlers compare via __lt__).
            handlers = lookup.setdefault(kind, [])
            if handlers:
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # the handlers must work in an specific order, the order
            # is specified in a Handler attribute
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in order
        and return the first result that is not None (None if there is
        no such result)."""
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None, timeout=None):
        """Open fullurl (a URL string or a Request) and return the response.

        The request is passed through the <protocol>_request processors,
        opened via _open(), and the response is passed through the
        <protocol>_response processors.
        """
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        # Note: this overwrites any timeout attribute already on req.
        req.timeout = timeout
        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        """Try the 'default' openers, then those for the request's
        protocol, then the 'unknown' openers; return the first truthy
        result (or the result of the 'unknown' chain)."""
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers.

        For 'http'/'https' the handlers under handle_error['http'] are
        used: first http_error_<code>, then http_error_default if that
        yields no truthy result.  For other protocols the chain
        <proto>_error is called.
        """
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            # Raises KeyError if no handler defining http_error_* methods
            # has been added.
            dict = self.handle_error['http'] # https is not different than http
            # args[2] is the third positional argument after proto; the
            # HTTP status code in the call made by HTTPErrorProcessor.
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
 425
 426 # XXX probably also want an abstract factory that knows when it makes
 427 # sense to skip a superclass in favor of a subclass and when it might
 428 # make sense to include both
 429
 430 def build_opener(*handlers):
 431     """Create an opener object from a list of handlers.
 432
 433     The opener will use several default handlers, including support
 434     for HTTP and FTP.
 435
 436     If any of the handlers passed as arguments are subclasses of the
 437     default handlers, the default handlers will not be used.
 438     """
 439     import types
 440     def isclass(obj):
 441         return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")
 442
 443     opener = OpenerDirector()
 444     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
 445                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
 446                        FTPHandler, FileHandler, HTTPErrorProcessor]
 447     if hasattr(httplib, 'HTTPS'):
 448         default_classes.append(HTTPSHandler)
 449     skip = []
 450     for klass in default_classes:
 451         for check in handlers:
 452             if isclass(check):
 453                 if issubclass(check, klass):
 454                     skip.append(klass)
 455             elif isinstance(check, klass):
 456                 skip.append(klass)
 457     for klass in skip:
 458         default_classes.remove(klass)
 459
 460     for klass in default_classes:
 461         opener.add_handler(klass())
 462
 463     for h in handlers:
 464         if isclass(h):
 465             h = h()
 466         opener.add_handler(h)
 467     return opener
 468
 469 class BaseHandler:
 470     handler_order = 500
 471
 472     def add_parent(self, parent):
 473         self.parent = parent
 474
 475     def close(self):
 476         # Only exists for backwards compatibility
 477         pass
 478
 479     def __lt__(self, other):
 480         if not hasattr(other, "handler_order"):
 481             # Try to preserve the old behavior of having custom classes
 482             # inserted after default ones (works only for custom user
 483             # classes which are not aware of handler_order).
 484             return True
 485         return self.handler_order < other.handler_order
 486
 487
 488 class HTTPErrorProcessor(BaseHandler):
 489     """Process HTTP error responses."""
 490     handler_order = 1000  # after all other processing
 491
 492     def http_response(self, request, response):
 493         code, msg, hdrs = response.code, response.msg, response.info()
 494
 495         # According to RFC 2616, "2xx" code indicates that the client's
 496         # request was successfully received, understood, and accepted.
 497         if not (200 <= code < 300):
 498             response = self.parent.error(
 499                 'http', request, response, code, msg, hdrs)
 500
 501         return response
 502
 503     https_response = http_response
 504
 505 class HTTPDefaultErrorHandler(BaseHandler):
 506     def http_error_default(self, req, fp, code, msg, hdrs):
 507         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
 508
 509 class HTTPRedirectHandler(BaseHandler):
 510     # maximum number of redirections to any single URL
 511     # this is needed because of the state that cookies introduce
 512     max_repeats = 4
 513     # maximum total number of redirections (regardless of URL) before
 514     # assuming we're in a loop
 515     max_redirections = 10
 516
 517     def redirect_request(self, req, fp, code, msg, headers, newurl):
 518         """Return a Request or None in response to a redirect.
 519
 520         This is called by the http_error_30x methods when a
 521         redirection response is received.  If a redirection should
 522         take place, return a new Request to allow http_error_30x to
 523         perform the redirect.  Otherwise, raise HTTPError if no-one
 524         else should try to handle this url.  Return None if you can't
 525         but another Handler might.
 526         """
 527         m = req.get_method()
 528         if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
 529             or code in (301, 302, 303) and m == "POST"):
 530             # Strictly (according to RFC 2616), 301 or 302 in response
 531             # to a POST MUST NOT cause a redirection without confirmation
 532             # from the user (of urllib2, in this case).  In practice,
 533             # essentially all clients do redirect in this case, so we
 534             # do the same.
 535             # be conciliant with URIs containing a space
 536             newurl = newurl.replace(' ', '%20')
 537             return Request(newurl,
 538                            headers=req.headers,
 539                            origin_req_host=req.get_origin_req_host(),
 540                            unverifiable=True)
 541         else:
 542             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
 543
 544     # Implementation note: To avoid the server sending us into an
 545     # infinite loop, the request object needs to track what URLs we
 546     # have already seen.  Do this by adding a handler-specific
 547     # attribute to the Request object.
 548     def http_error_302(self, req, fp, code, msg, headers):
 549         # Some servers (incorrectly) return multiple Location headers
 550         # (so probably same goes for URI).  Use first header.
 551         if 'location' in headers:
 552             newurl = headers.getheaders('location')[0]
 553         elif 'uri' in headers:
 554             newurl = headers.getheaders('uri')[0]
 555         else:
 556             return
 557         newurl = urlparse.urljoin(req.get_full_url(), newurl)
 558
 559         # XXX Probably want to forget about the state of the current
 560         # request, although that might interact poorly with other
 561         # handlers that also use handler-specific request attributes
 562         new = self.redirect_request(req, fp, code, msg, headers, newurl)
 563         if new is None:
 564             return
 565
 566         # loop detection
 567         # .redirect_dict has a key url if url was previously visited.
 568         if hasattr(req, 'redirect_dict'):
 569             visited = new.redirect_dict = req.redirect_dict
 570             if (visited.get(newurl, 0) >= self.max_repeats or
 571                 len(visited) >= self.max_redirections):
 572                 raise HTTPError(req.get_full_url(), code,
 573                                 self.inf_msg + msg, headers, fp)
 574         else:
 575             visited = new.redirect_dict = req.redirect_dict = {}
 576         visited[newurl] = visited.get(newurl, 0) + 1
 577
 578         # Don't close the fp until we are sure that we won't use it
 579         # with HTTPError.
 580         fp.read()
 581         fp.close()
 582
 583         return self.parent.open(new)
 584
 585     http_error_301 = http_error_303 = http_error_307 = http_error_302
 586
 587     inf_msg = "The HTTP server returned a redirect error that would " \
 588               "lead to an infinite loop.\n" \
 589               "The last 30x error message was:\n"
 590
 591
 592 def _parse_proxy(proxy):
 593     """Return (scheme, user, password, host/port) given a URL or an authority.
 594
 595     If a URL is supplied, it must have an authority (host:port) component.
 596     According to RFC 3986, having an authority component means the URL must
 597     have two slashes after the scheme:
 598
 599     >>> _parse_proxy('file:/ftp.example.com/')
 600     Traceback (most recent call last):
 601     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
 602
 603     The first three items of the returned tuple may be None.
 604
 605     Examples of authority parsing:
 606
 607     >>> _parse_proxy('proxy.example.com')
 608     (None, None, None, 'proxy.example.com')
 609     >>> _parse_proxy('proxy.example.com:3128')
 610     (None, None, None, 'proxy.example.com:3128')
 611
 612     The authority component may optionally include userinfo (assumed to be
 613     username:password):
 614
 615     >>> _parse_proxy('joe:password@proxy.example.com')
 616     (None, 'joe', 'password', 'proxy.example.com')
 617     >>> _parse_proxy('joe:password@proxy.example.com:3128')
 618     (None, 'joe', 'password', 'proxy.example.com:3128')
 619
 620     Same examples, but with URLs instead:
 621
 622     >>> _parse_proxy('http://proxy.example.com/')
 623     ('http', None, None, 'proxy.example.com')
 624     >>> _parse_proxy('http://proxy.example.com:3128/')
 625     ('http', None, None, 'proxy.example.com:3128')
 626     >>> _parse_proxy('http://joe:password@proxy.example.com/')
 627     ('http', 'joe', 'password', 'proxy.example.com')
 628     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
 629     ('http', 'joe', 'password', 'proxy.example.com:3128')
 630
 631     Everything after the authority is ignored:
 632
 633     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
 634     ('ftp', 'joe', 'password', 'proxy.example.com')
 635
 636     Test for no trailing '/' case:
 637
 638     >>> _parse_proxy('http://joe:password@proxy.example.com')
 639     ('http', 'joe', 'password', 'proxy.example.com')
 640
 641     """
 642     scheme, r_scheme = splittype(proxy)
 643     if not r_scheme.startswith("/"):
 644         # authority
 645         scheme = None
 646         authority = proxy
 647     else:
 648         # URL
 649         if not r_scheme.startswith("//"):
 650             raise ValueError("proxy URL with no authority: %r" % proxy)
 651         # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
 652         # and 3.3.), path is empty or starts with '/'
 653         end = r_scheme.find("/", 2)
 654         if end == -1:
 655             end = None
 656         authority = r_scheme[2:end]
 657     userinfo, hostport = splituser(authority)
 658     if userinfo is not None:
 659         user, password = splitpasswd(userinfo)
 660     else:
 661         user = password = None
 662     return scheme, user, password, hostport
 663
 664 class ProxyHandler(BaseHandler):
 665     # Proxies must be in front
 666     handler_order = 100
 667
 668     def __init__(self, proxies=None):
 669         if proxies is None:
 670             proxies = getproxies()
 671         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 672         self.proxies = proxies
 673         for type, url in proxies.items():
 674             setattr(self, '%s_open' % type,
 675                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
 676                     meth(r, proxy, type))
 677
 678     def proxy_open(self, req, proxy, type):
 679         orig_type = req.get_type()
 680         proxy_type, user, password, hostport = _parse_proxy(proxy)
 681         if proxy_type is None:
 682             proxy_type = orig_type
 683         if user and password:
 684             user_pass = '%s:%s' % (unquote(user), unquote(password))
 685             creds = base64.b64encode(user_pass).strip()
 686             req.add_header('Proxy-authorization', 'Basic ' + creds)
 687         hostport = unquote(hostport)
 688         req.set_proxy(hostport, proxy_type)
 689         if orig_type == proxy_type:
 690             # let other handlers take care of it
 691             return None
 692         else:
 693             # need to start over, because the other handlers don't
 694             # grok the proxy's URL type
 695             # e.g. if we have a constructor arg proxies like so:
 696             # {'http': 'ftp://proxy.example.com'}, we may end up turning
 697             # a request for http://acme.example.com/a into one for
 698             # ftp://proxy.example.com/a
 699             return self.parent.open(req)
 700
 701 class HTTPPasswordMgr:
 702
 703     def __init__(self):
 704         self.passwd = {}
 705
 706     def add_password(self, realm, uri, user, passwd):
 707         # uri could be a single URI or a sequence
 708         if isinstance(uri, basestring):
 709             uri = [uri]
 710         if not realm in self.passwd:
 711             self.passwd[realm] = {}
 712         for default_port in True, False:
 713             reduced_uri = tuple(
 714                 [self.reduce_uri(u, default_port) for u in uri])
 715             self.passwd[realm][reduced_uri] = (user, passwd)
 716
 717     def find_user_password(self, realm, authuri):
 718         domains = self.passwd.get(realm, {})
 719         for default_port in True, False:
 720             reduced_authuri = self.reduce_uri(authuri, default_port)
 721             for uris, authinfo in domains.iteritems():
 722                 for uri in uris:
 723                     if self.is_suburi(uri, reduced_authuri):
 724                         return authinfo
 725         return None, None
 726
 727     def reduce_uri(self, uri, default_port=True):
 728         """Accept authority or URI and extract only the authority and path."""
 729         # note HTTP URLs do not have a userinfo component
 730         parts = urlparse.urlsplit(uri)
 731         if parts[1]:
 732             # URI
 733             scheme = parts[0]
 734             authority = parts[1]
 735             path = parts[2] or '/'
 736         else:
 737             # host or host:port
 738             scheme = None
 739             authority = uri
 740             path = '/'
 741         host, port = splitport(authority)
 742         if default_port and port is None and scheme is not None:
 743             dport = {"http": 80,
 744                      "https": 443,
 745                      }.get(scheme)
 746             if dport is not None:
 747                 authority = "%s:%d" % (host, dport)
 748         return authority, path
 749
 750     def is_suburi(self, base, test):
 751         """Check if test is below base in a URI tree
 752
 753         Both args must be URIs in reduced form.
 754         """
 755         if base == test:
 756             return True
 757         if base[0] != test[0]:
 758             return False
 759         common = posixpath.commonprefix((base[1], test[1]))
 760         if len(common) == len(base[1]):
 761             return True
 762         return False
 763
 764
 765 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
 766
 767     def find_user_password(self, realm, authuri):
 768         user, password = HTTPPasswordMgr.find_user_password(self, realm,
 769                                                             authuri)
 770         if user is not None:
 771             return user, password
 772         return HTTPPasswordMgr.find_user_password(self, None, authuri)
 773
 774
 775 class AbstractBasicAuthHandler:
 776
 777     # XXX this allows for multiple auth-schemes, but will stupidly pick
 778     # the last one with a realm specified.
 779
 780     rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)
 781
 782     # XXX could pre-emptively send auth info already accepted (RFC 2617,
 783     # end of section 2, and section 1.2 immediately after "credentials"
 784     # production).
 785
 786     def __init__(self, password_mgr=None):
 787         if password_mgr is None:
 788             password_mgr = HTTPPasswordMgr()
 789         self.passwd = password_mgr
 790         self.add_password = self.passwd.add_password
 791
 792     def http_error_auth_reqed(self, authreq, host, req, headers):
 793         # host may be an authority (without userinfo) or a URL with an
 794         # authority
 795         # XXX could be multiple headers
 796         authreq = headers.get(authreq, None)
 797         if authreq:
 798             mo = AbstractBasicAuthHandler.rx.search(authreq)
 799             if mo:
 800                 scheme, realm = mo.groups()
 801                 if scheme.lower() == 'basic':
 802                     return self.retry_http_basic_auth(host, req, realm)
 803
 804     def retry_http_basic_auth(self, host, req, realm):
 805         user, pw = self.passwd.find_user_password(realm, host)
 806         if pw is not None:
 807             raw = "%s:%s" % (user, pw)
 808             auth = 'Basic %s' % base64.b64encode(raw).strip()
 809             if req.headers.get(self.auth_header, None) == auth:
 810                 return None
 811             req.add_header(self.auth_header, auth)
 812             return self.parent.open(req)
 813         else:
 814             return None
 815
 816
 817 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 818
 819     auth_header = 'Authorization'
 820
 821     def http_error_401(self, req, fp, code, msg, headers):
 822         url = req.get_full_url()
 823         return self.http_error_auth_reqed('www-authenticate',
 824                                           url, req, headers)
 825
 826
 827 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
 828
 829     auth_header = 'Proxy-authorization'
 830
 831     def http_error_407(self, req, fp, code, msg, headers):
 832         # http_error_auth_reqed requires that there is no userinfo component in
 833         # authority.  Assume there isn't one, since urllib2 does not (and
 834         # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
 835         # userinfo.
 836         authority = req.get_host()
 837         return self.http_error_auth_reqed('proxy-authenticate',
 838                                           authority, req, headers)
 839
 840
 841 def randombytes(n):
 842     """Return n random bytes."""
 843     # Use /dev/urandom if it is available.  Fall back to random module
 844     # if not.  It might be worthwhile to extend this function to use
 845     # other platform-specific mechanisms for getting random bytes.
 846     if os.path.exists("/dev/urandom"):
 847         f = open("/dev/urandom")
 848         s = f.read(n)
 849         f.close()
 850         return s
 851     else:
 852         L = [chr(random.randrange(0, 256)) for i in range(n)]
 853         return "".join(L)
 854
 855 class AbstractDigestAuthHandler:
 856     # Digest authentication is specified in RFC 2617.
 857
 858     # XXX The client does not inspect the Authentication-Info header
 859     # in a successful response.
 860
 861     # XXX It should be possible to test this implementation against
 862     # a mock server that just generates a static set of challenges.
 863
 864     # XXX qop="auth-int" supports is shaky
 865
 866     def __init__(self, passwd=None):
 867         if passwd is None:
 868             passwd = HTTPPasswordMgr()
 869         self.passwd = passwd
 870         self.add_password = self.passwd.add_password
 871         self.retried = 0
 872         self.nonce_count = 0
 873
 874     def reset_retry_count(self):
 875         self.retried = 0
 876
 877     def http_error_auth_reqed(self, auth_header, host, req, headers):
 878         authreq = headers.get(auth_header, None)
 879         if self.retried > 5:
 880             # Don't fail endlessly - if we failed once, we'll probably
 881             # fail a second time. Hm. Unless the Password Manager is
 882             # prompting for the information. Crap. This isn't great
 883             # but it's better than the current 'repeat until recursion
 884             # depth exceeded' approach <wink>
 885             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
 886                             headers, None)
 887         else:
 888             self.retried += 1
 889         if authreq:
 890             scheme = authreq.split()[0]
 891             if scheme.lower() == 'digest':
 892                 return self.retry_http_digest_auth(req, authreq)
 893
 894     def retry_http_digest_auth(self, req, auth):
 895         token, challenge = auth.split(' ', 1)
 896         chal = parse_keqv_list(parse_http_list(challenge))
 897         auth = self.get_authorization(req, chal)
 898         if auth:
 899             auth_val = 'Digest %s' % auth
 900             if req.headers.get(self.auth_header, None) == auth_val:
 901                 return None
 902             req.add_unredirected_header(self.auth_header, auth_val)
 903             resp = self.parent.open(req)
 904             return resp
 905
 906     def get_cnonce(self, nonce):
 907         # The cnonce-value is an opaque
 908         # quoted string value provided by the client and used by both client
 909         # and server to avoid chosen plaintext attacks, to provide mutual
 910         # authentication, and to provide some message integrity protection.
 911         # This isn't a fabulous effort, but it's probably Good Enough.
 912         dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
 913                                             randombytes(8))).hexdigest()
 914         return dig[:16]
 915
 916     def get_authorization(self, req, chal):
 917         try:
 918             realm = chal['realm']
 919             nonce = chal['nonce']
 920             qop = chal.get('qop')
 921             algorithm = chal.get('algorithm', 'MD5')
 922             # mod_digest doesn't send an opaque, even though it isn't
 923             # supposed to be optional
 924             opaque = chal.get('opaque', None)
 925         except KeyError:
 926             return None
 927
 928         H, KD = self.get_algorithm_impls(algorithm)
 929         if H is None:
 930             return None
 931
 932         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
 933         if user is None:
 934             return None
 935
 936         # XXX not implemented yet
 937         if req.has_data():
 938             entdig = self.get_entity_digest(req.get_data(), chal)
 939         else:
 940             entdig = None
 941
 942         A1 = "%s:%s:%s" % (user, realm, pw)
 943         A2 = "%s:%s" % (req.get_method(),
 944                         # XXX selector: what about proxies and full urls
 945                         req.get_selector())
 946         if qop == 'auth':
 947             self.nonce_count += 1
 948             ncvalue = '%08x' % self.nonce_count
 949             cnonce = self.get_cnonce(nonce)
 950             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
 951             respdig = KD(H(A1), noncebit)
 952         elif qop is None:
 953             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
 954         else:
 955             # XXX handle auth-int.
 956             raise URLError("qop '%s' is not supported." % qop)
 957
 958         # XXX should the partial digests be encoded too?
 959
 960         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
 961                'response="%s"' % (user, realm, nonce, req.get_selector(),
 962                                   respdig)
 963         if opaque:
 964             base += ', opaque="%s"' % opaque
 965         if entdig:
 966             base += ', digest="%s"' % entdig
 967         base += ', algorithm="%s"' % algorithm
 968         if qop:
 969             base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
 970         return base
 971
 972     def get_algorithm_impls(self, algorithm):
 973         # lambdas assume digest modules are imported at the top level
 974         if algorithm == 'MD5':
 975             H = lambda x: hashlib.md5(x).hexdigest()
 976         elif algorithm == 'SHA':
 977             H = lambda x: hashlib.sha1(x).hexdigest()
 978         # XXX MD5-sess
 979         KD = lambda s, d: H("%s:%s" % (s, d))
 980         return H, KD
 981
 982     def get_entity_digest(self, data, chal):
 983         # XXX not implemented yet
 984         return None
 985
 986
 987 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
 988     """An authentication protocol defined by RFC 2069
 989
 990     Digest authentication improves on basic authentication because it
 991     does not transmit passwords in the clear.
 992     """
 993
 994     auth_header = 'Authorization'
 995     handler_order = 490  # before Basic auth
 996
 997     def http_error_401(self, req, fp, code, msg, headers):
 998         host = urlparse.urlparse(req.get_full_url())[1]
 999         retry = self.http_error_auth_reqed('www-authenticate',
1000                                            host, req, headers)
1001         self.reset_retry_count()
1002         return retry
1003
1004
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication handler for 407 responses."""

    handler_order = 490  # before Basic auth
    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 'proxy-authenticate' challenge for req.

        Returns the result of http_error_auth_reqed().  The retry
        counter is reset once that call has returned normally.
        """
        authority = req.get_host()
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              authority, req, headers)
        self.reset_retry_count()
        return response
1016
class AbstractHTTPHandler(BaseHandler):
    """Common request preparation and sending code for HTTP handlers."""

    def __init__(self, debuglevel=0):
        """Remember the debug level handed to each new connection."""
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        """Change the debug level used for subsequent connections."""
        self._debuglevel = level

    def do_request_(self, request):
        """Fill in default headers on request and return it.

        When the request has data, Content-type and Content-length are
        added unless already present.  A Host header is added unless
        present, taken from the selector when it contains a host and
        from the request's host otherwise.  Headers from
        self.parent.addheaders are added when the request lacks them.
        All of these are added as unredirected headers.

        Raises URLError if the request has no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        scheme, sel = splittype(request.get_selector())
        sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code

        Raises URLError if the request has no host, or if sending the
        request or reading the response raises socket.error; in the
        latter case the connection is closed first.
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host, timeout=req.timeout) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error as err: # XXX what error?
            # Nobody else holds a reference to the connection, so close
            # it here rather than leaving the socket open.
            h.close()
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
1103
1104
class HTTPHandler(AbstractHTTPHandler):
    """Handler that opens requests through httplib.HTTPConnection."""

    # Request preparation is the shared do_request_ implementation.
    http_request = AbstractHTTPHandler.do_request_

    def http_open(self, req):
        """Send req with do_open() and return its result."""
        connection_class = httplib.HTTPConnection
        return self.do_open(connection_class, req)
1111
if hasattr(httplib, 'HTTPS'):
    # Only defined when httplib exposes an 'HTTPS' attribute.
    class HTTPSHandler(AbstractHTTPHandler):
        """Handler that opens requests through httplib.HTTPSConnection."""

        # Request preparation is the shared do_request_ implementation.
        https_request = AbstractHTTPHandler.do_request_

        def https_open(self, req):
            """Send req with do_open() and return its result."""
            connection_class = httplib.HTTPSConnection
            return self.do_open(connection_class, req)
1119
class HTTPCookieProcessor(BaseHandler):
    """Handler that passes requests and responses through a cookie jar."""

    def __init__(self, cookiejar=None):
        """Use cookiejar, or a new cookielib.CookieJar when it is None."""
        import cookielib
        self.cookiejar = (cookielib.CookieJar() if cookiejar is None
                          else cookiejar)

    def http_request(self, request):
        """Let the jar add its cookie header, then return request."""
        jar = self.cookiejar
        jar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        """Let the jar extract cookies from response, then return it."""
        jar = self.cookiejar
        jar.extract_cookies(response, request)
        return response

    # https traffic is processed by the very same methods.
    https_request = http_request
    https_response = http_response
1137
class UnknownHandler(BaseHandler):
    """Handler whose unknown_open() always fails with URLError."""

    def unknown_open(self, req):
        """Raise URLError naming the type reported by req.get_type()."""
        raise URLError('unknown url type: %s' % req.get_type())
1142
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Each element is split at its first '='.  A value that starts and
    ends with a double quote has those quotes removed.  An empty value
    ('key=') is kept as an empty string.  Returns a dict mapping each
    key to its value; should a key occur more than once, the last
    occurrence wins.

    Raises ValueError if an element contains no '='.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices rather than v[0] / v[-1], so that an empty value does
        # not raise IndexError.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
1152
def parse_http_list(s):
    """Split a comma-separated list as described by RFC 2068 Section 2.

    Elements may contain quoted-strings, and a comma inside a
    quoted-string does not separate elements.  Quotes may also appear
    in the middle of an otherwise unquoted element.  Inside a
    quoted-string a backslash escapes the next character: the backslash
    is dropped and the character is taken literally.  Only double
    quotes count, not single quotes.

    Returns the list of elements with surrounding whitespace stripped;
    the quote characters themselves are kept.
    """
    items = []
    chars = []          # characters of the element being collected
    in_quotes = False
    escaped = False

    for ch in s:
        if escaped:
            # Character following a backslash: always taken literally.
            chars.append(ch)
            escaped = False
        elif in_quotes:
            if ch == '\\':
                escaped = True
            else:
                if ch == '"':
                    in_quotes = False
                chars.append(ch)
        elif ch == ',':
            # Unquoted comma: the current element is complete.
            items.append(''.join(chars))
            chars = []
        else:
            if ch == '"':
                in_quotes = True
            chars.append(ch)

    # A trailing element is only recorded when it is non-empty.
    if chars:
        items.append(''.join(chars))

    return [item.strip() for item in items]
1195
class FileHandler(BaseHandler):
    """Handler for file URLs."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req as a local file, or re-dispatch it as an ftp request.

        A selector that starts with exactly two slashes is re-opened
        through self.parent with the request type changed to 'ftp'.
        """
        selector = req.get_selector()
        has_authority = selector[:2] == '//' and selector[2:3] != '/'
        if not has_authority:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the tuple of addresses treated as the local host.

        The tuple is computed on first use and stored on the class.
        """
        if FileHandler.names is not None:
            return FileHandler.names
        try:
            FileHandler.names = (socket.gethostbyname('localhost'),
                                 socket.gethostbyname(socket.gethostname()))
        except socket.gaierror:
            # The machine's own host name could not be resolved; use
            # only the address of 'localhost'.
            FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for the local file named by req.

        The response headers carry Content-type (guessed from the
        selector, 'text/plain' if unknown), Content-length and
        Last-modified taken from os.stat().

        Raises URLError if an OSError occurs, or if the request names a
        host that carries a port or does not resolve to one of the
        addresses from get_names().
        """
        import email.utils
        import mimetypes
        host = req.get_host()
        selector = req.get_selector()
        path = url2pathname(selector)
        try:
            st = os.stat(path)
            last_modified = email.utils.formatdate(st.st_mtime, usegmt=True)
            content_type = mimetypes.guess_type(selector)[0] or 'text/plain'
            header_text = (
                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                (content_type, st.st_size, last_modified))
            headers = mimetools.Message(StringIO(header_text))
            if host:
                host, port = splitport(host)
            # port is only evaluated when host is non-empty, in which
            # case splitport() above has set it.
            is_local = not host or \
                (not port and socket.gethostbyname(host) in self.get_names())
            if is_local:
                return addinfourl(open(path, 'rb'),
                                  headers, 'file:'+selector)
        except OSError as msg:
            # urllib2 users shouldn't expect OSErrors coming from urlopen()
            raise URLError(msg)
        raise URLError('file not on local host')
1242
1243 class FTPHandler(BaseHandler):
1244     def ftp_open(self, req):
1245         import ftplib
1246         import mimetypes
1247         host = req.get_host()
1248         if not host:
1249             raise URLError, ('ftp error', 'no host given')
1250         host, port = splitport(host)
1251         if port is None:
1252             port = ftplib.FTP_PORT
1253         else:
1254             port = int(port)
1255
1256         # username/password handling
1257         user, host = splituser(host)
1258         if user:
1259             user, passwd = splitpasswd(user)
1260         else:
1261             passwd = None
1262         host = unquote(host)
1263         user = unquote(user or '')
1264         passwd = unquote(passwd or '')
1265
1266         try:
1267             host = socket.gethostbyname(host)
1268         except socket.error, msg:
1269             raise URLError(msg)
1270         path, attrs = splitattr(req.get_selector())
1271         dirs = path.split('/')
1272         dirs = map(unquote, dirs)
1273         dirs, file = dirs[:-1], dirs[-1]
1274         if dirs and not dirs[0]:
1275             dirs = dirs[1:]
1276         try:
1277             fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
1278             type = file and 'I' or 'D'
1279             for attr in attrs:
1280                 attr, value = splitvalue(attr)
1281                 if attr.lower() == 'type' and \
1282                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
1283                     type = value.upper()
1284             fp, retrlen = fw.retrfile(file, type)
1285             headers = ""
1286             mtype = mimetypes.guess_type(req.get_full_url())[0]
1287             if mtype:
1288                 headers += "Content-type: %s\n" % mtype
1289             if retrlen is not None and retrlen >= 0:
1290                 headers += "Content-length: %d\n" % retrlen
1291             sf = StringIO(headers)
1292             headers = mimetools.Message(sf)
1293             return addinfourl(fp, headers, req.get_full_url())
1294         except ftplib.all_errors, msg:
1295             raise URLError, ('ftp error', msg), sys.exc_info()[2]
1296
1297     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
1298         fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
1299 ##        fw.ftp.set_debuglevel(1)
1300         return fw
1301
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps ftpwrapper objects in a cache for reuse.

    Entries are keyed by (user, host, port, joined dirs, timeout).  Each
    entry has an expiry time of "last use + self.delay" seconds.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe
    def __init__(self):
        self.cache = {}       # key -> ftpwrapper
        self.timeout = {}     # key -> expiry time (time.time() based)
        self.soonest = 0      # smallest expiry time, 0 when cache is empty
        self.delay = 60       # seconds added to "now" on every use
        self.max_conns = 16   # cache size at which one entry is evicted

    def setTimeout(self, t):
        """Set the number of seconds an entry stays valid after use."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which check_cache() evicts an entry."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
        """Return the ftpwrapper for these parameters, creating and
        caching one if necessary, and refresh its expiry time.
        """
        key = user, host, port, '/'.join(dirs), timeout
        if key not in self.cache:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
        self.timeout[key] = time.time() + self.delay
        # Take the reference before pruning: check_cache() may remove
        # this very entry (e.g. with max_conns == 1), and looking it up
        # afterwards would raise KeyError.
        fw = self.cache[key]
        self.check_cache()
        return fw

    def check_cache(self):
        """Close and drop expired entries, then evict the entry with the
        soonest expiry if the cache holds exactly max_conns entries.
        """
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # Iterate over a copy because entries are deleted in the loop.
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
        # min() raises ValueError on an empty sequence, so fall back to
        # the initial value 0 when nothing is cached.
        self.soonest = min(self.timeout.values()) if self.timeout else 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    # NOTE(review): unlike the expiry branch, the evicted
                    # wrapper is not closed here; kept as is because it
                    # may still be in use by a caller.
                    del self.cache[k]
                    del self.timeout[k]
                    break
            self.soonest = min(self.timeout.values()) if self.timeout else 0
1345                     break
1346             self.soonest = min(self.timeout.values())