Fix off-by-one error that resulted in missed characters
[pytest.git] / Lib / urllib2.py
blob3578e7a0b97ba90a9c813697bfd49fa996b1a567
1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as a file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15 deals with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. if one of the argument is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
33 OpenerDirector --
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. it can also include extra HTTP
37 headers, e.g. a User-Agent.
39 BaseHandler --
41 exceptions:
42 URLError-- a subclass of IOError, individual protocols have their own
43 specific subclass
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password(realm='PDQ Application',
59 uri='https://mahler:8092/site-updates.py',
60 user='klem',
61 passwd='geheim$parole')
63 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
65 # build a new opener that adds authentication and caching FTP handlers
66 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
68 # install it
69 urllib2.install_opener(opener)
71 f = urllib2.urlopen('http://www.python.org/')
74 """
76 # XXX issues:
77 # If an authentication error handler that tries to perform
78 # authentication for some reason but fails, how should the error be
79 # signalled? The client needs to know the HTTP error code. But if
80 # the handler knows what the problem was, e.g., that it didn't know
81 # the hash algorithm that was requested in the challenge, it would be good to
82 # pass that information along to the client, too.
83 # ftp errors aren't handled cleanly
84 # check digest against correct (i.e. non-apache) implementation
86 # Possible extensions:
87 # complex proxies XXX not sure what exactly was meant by this
88 # abstract factory for opener
90 import base64
91 import hashlib
92 import httplib
93 import mimetools
94 import os
95 import posixpath
96 import random
97 import re
98 import socket
99 import sys
100 import time
101 import urlparse
102 import bisect
104 try:
105 from cStringIO import StringIO
106 except ImportError:
107 from StringIO import StringIO
109 from urllib import (unwrap, unquote, splittype, splithost, quote,
110 addinfourl, splitport, splitgophertype, splitquery,
111 splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
113 # support for FileHandler, proxies via environment variables
114 from urllib import localhost, url2pathname, getproxies
# used in User-Agent header sent
__version__ = sys.version[:3]  # e.g. '2.5' -- major.minor of the interpreter
# the single module-wide opener, created lazily by urlopen()
_opener = None
def urlopen(url, data=None):
    """Open *url* (a string or Request); return a file-like response object.

    *data*, when given, is passed through to the opener as the request
    body (for HTTP this makes the request a POST).
    """
    global _opener
    if _opener is None:
        _opener = build_opener()
    return _opener.open(url, data)
def install_opener(opener):
    """Install *opener* as the default opener used by urlopen()."""
    global _opener
    _opener = opener
130 # do these error classes make sense?
131 # make sure all of the IOError stuff is overridden. we just want to be
132 # subtypes.
class URLError(IOError):
    """Error raised by handlers; carries a *reason* (string or exception).

    URLError is a sub-type of IOError but shares none of its
    implementation, so __init__ and __str__ are overridden.  self.args
    is populated for compatibility with other EnvironmentError
    subclasses, although it does not follow the usual
    (errno, strerror) layout.  This may be better than nothing.
    """

    def __init__(self, reason):
        self.args = (reason,)
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason
class HTTPError(URLError, addinfourl):
    """Raised when HTTP error occurs, but also acts like non-error return"""
    # private (name-mangled) alias so subclasses cannot clobber it
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        # code: numeric HTTP status; msg: reason phrase; hdrs: response
        # headers; fp: file-like response body (may be None)
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # The addinfourl classes depend on fp being a valid file
        # object.  In some cases, the HTTPError may not have a valid
        # file object.  If this happens, the simplest workaround is to
        # not initialize the base classes.
        if fp is not None:
            self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)
class GopherError(URLError):
    """Error raised by the gopher protocol handler."""
    pass
# copied from cookielib.py
_cut_port_re = re.compile(r":\d+$")

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.
    """
    host = urlparse.urlparse(request.get_full_url())[1]
    if not host:
        # URL carried no host component; fall back to the Host header
        host = request.get_header("Host", "")
    # strip any :port suffix, then normalise case
    return _cut_port_re.sub("", host, 1).lower()
class Request:
    """Encapsulate the state of a single request: URL, body data, headers.

    The original URL is kept privately; set_proxy() may later rewrite
    the host/selector used on the wire.
    """

    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False):
        # NOTE(review): headers={} is a shared mutable default; it is only
        # iterated here, never mutated, so this is safe as written.
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        for key, value in headers.items():
            self.add_header(key, value)
        # headers that will NOT be copied onto a redirected request
        self.unredirected_hdrs = {}
        if origin_req_host is None:
            origin_req_host = request_host(self)
        self.origin_req_host = origin_req_host
        self.unverifiable = unverifiable

    def __getattr__(self, attr):
        # XXX this is a fallback mechanism to guard against these
        # methods getting called in a non-standard order. this may be
        # too complicated and/or unnecessary.
        # XXX should the __r_XXX attributes be public?
        # '_Request__r_<name>' is the mangled form of the private
        # __r_<name> attributes; calling get_<name>() populates them.
        if attr[:12] == '_Request__r_':
            name = attr[12:]
            if hasattr(Request, 'get_' + name):
                getattr(self, 'get_' + name)()
                return getattr(self, attr)
        raise AttributeError, attr

    def get_method(self):
        # the HTTP method: POST when a body is present, GET otherwise
        if self.has_data():
            return "POST"
        else:
            return "GET"

    # XXX these helper methods are lame

    def add_data(self, data):
        self.data = data

    def has_data(self):
        return self.data is not None

    def get_data(self):
        return self.data

    def get_full_url(self):
        return self.__original

    def get_type(self):
        # URL scheme, computed lazily and cached
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError, "unknown url type: %s" % self.__original
        return self.type

    def get_host(self):
        # (unquoted) host, computed lazily and cached
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        # what is sent on the request line after the method
        return self.__r_host

    def set_proxy(self, host, type):
        # when proxied, the selector must be the full original URL
        self.host, self.type = host, type
        self.__r_host = self.__original

    def get_origin_req_host(self):
        return self.origin_req_host

    def is_unverifiable(self):
        return self.unverifiable

    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val

    def add_unredirected_header(self, key, val):
        # will not be added to a redirected request
        self.unredirected_hdrs[key.capitalize()] = val

    def has_header(self, header_name):
        return (header_name in self.headers or
                header_name in self.unredirected_hdrs)

    def get_header(self, header_name, default=None):
        # normal headers take precedence over unredirected ones
        return self.headers.get(
            header_name,
            self.unredirected_hdrs.get(header_name, default))

    def header_items(self):
        hdrs = self.unredirected_hdrs.copy()
        hdrs.update(self.headers)
        return hdrs.items()
class OpenerDirector:
    """Manage a chain of handlers and use them to open URLs.

    Handlers register themselves via add_handler(); open() then runs a
    request through the registered pre-processors, the protocol's open
    handlers, and the response post-processors.
    """

    def __init__(self):
        client_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', client_version)]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}       # protocol -> handlers with <proto>_open
        self.handle_error = {}      # protocol -> {code -> error handlers}
        self.process_response = {}  # protocol -> response post-processors
        self.process_request = {}   # protocol -> request pre-processors

    def add_handler(self, handler):
        """File *handler* under every protocol/condition its methods name.

        Method names of the form <protocol>_open, <protocol>_request,
        <protocol>_response and <protocol>_error_<kind> determine where
        the handler is registered.
        """
        if not hasattr(handler, "add_parent"):
            raise TypeError("expected BaseHandler instance, got %r" %
                            type(handler))

        added = False
        for meth in dir(handler):
            if meth in ["redirect_request", "do_open", "proxy_open"]:
                # oops, coincidental match
                continue

            i = meth.find("_")
            protocol = meth[:i]
            condition = meth[i+1:]

            if condition.startswith("error"):
                # meth is <proto>_error_<kind>; kind may be an integer
                # HTTP status code or a string such as 'default'
                j = condition.find("_") + i + 1
                kind = meth[j+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    pass
                lookup = self.handle_error.get(protocol, {})
                self.handle_error[protocol] = lookup
            elif condition == "open":
                kind = protocol
                lookup = self.handle_open
            elif condition == "response":
                kind = protocol
                lookup = self.process_response
            elif condition == "request":
                kind = protocol
                lookup = self.process_request
            else:
                continue

            handlers = lookup.setdefault(kind, [])
            if handlers:
                # keep each per-kind list sorted (BaseHandler.__lt__
                # compares handler_order)
                bisect.insort(handlers, handler)
            else:
                handlers.append(handler)
            added = True

        if added:
            # XXX why does self.handlers need to be sorted?
            bisect.insort(self.handlers, handler)
            handler.add_parent(self)

    def close(self):
        # Only exists for backwards compatibility.
        pass

    def _call_chain(self, chain, kind, meth_name, *args):
        # Handlers raise an exception if no one else should try to handle
        # the request, or return None if they can't but another handler
        # could.  Otherwise, they return the response.
        handlers = chain.get(kind, ())
        for handler in handlers:
            func = getattr(handler, meth_name)

            result = func(*args)
            if result is not None:
                return result

    def open(self, fullurl, data=None):
        """Open fullurl (URL string or Request); return the response."""
        # accept a URL or a Request object
        if isinstance(fullurl, basestring):
            req = Request(fullurl, data)
        else:
            req = fullurl
            if data is not None:
                req.add_data(data)

        protocol = req.get_type()

        # pre-process request
        meth_name = protocol+"_request"
        for processor in self.process_request.get(protocol, []):
            meth = getattr(processor, meth_name)
            req = meth(req)

        response = self._open(req, data)

        # post-process response
        meth_name = protocol+"_response"
        for processor in self.process_response.get(protocol, []):
            meth = getattr(processor, meth_name)
            response = meth(req, response)

        return response

    def _open(self, req, data=None):
        # 'default' handlers get first crack ...
        result = self._call_chain(self.handle_open, 'default',
                                  'default_open', req)
        if result:
            return result

        # ... then the protocol-specific ones ...
        protocol = req.get_type()
        result = self._call_chain(self.handle_open, protocol, protocol +
                                  '_open', req)
        if result:
            return result

        # ... and finally the unknown-protocol fallback.
        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the registered error handlers."""
        if proto in ('http', 'https'):
            # XXX http[s] protocols are special-cased
            dict = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!  (args[2] is the HTTP status code)
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            dict = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        args = (dict, proto, meth_name) + args
        result = self._call_chain(*args)
        if result:
            return result

        if http_err:
            # nothing handled the specific code: fall back to the
            # default HTTP error handler
            args = (dict, 'default', 'http_error_default') + orig_args
            return self._call_chain(*args)
427 # XXX probably also want an abstract factory that knows when it makes
428 # sense to skip a superclass in favor of a subclass and when it might
429 # make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.
    """
    import types
    def isclass(obj):
        return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")

    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler, HTTPErrorProcessor]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)
    # Use a set: with a list, a default class matched by more than one
    # passed-in handler (e.g. both a subclass and an instance of it) was
    # appended twice, and the second default_classes.remove() below then
    # raised ValueError.
    skip = set()
    for klass in default_classes:
        for check in handlers:
            if isclass(check):
                if issubclass(check, klass):
                    skip.add(klass)
            elif isinstance(check, klass):
                skip.add(klass)
    for klass in skip:
        default_classes.remove(klass)

    for klass in default_classes:
        opener.add_handler(klass())

    for h in handlers:
        if isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
class BaseHandler:
    """Base class for handlers managed by an OpenerDirector."""

    # Handlers are sorted by this value; lower values sort earlier.
    handler_order = 500

    def add_parent(self, parent):
        """Remember the OpenerDirector that owns this handler."""
        self.parent = parent

    def close(self):
        """Do nothing; retained only for backwards compatibility."""
        pass

    def __lt__(self, other):
        if not hasattr(other, "handler_order"):
            # Preserve the old behavior of sorting handler_order-unaware
            # custom classes after the default ones.
            return True
        return self.handler_order < other.handler_order
class HTTPErrorProcessor(BaseHandler):
    """Route non-2xx HTTP responses into the error-handler chain."""

    handler_order = 1000  # after all other processing

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        # 200 (OK) and 206 (Partial Content) pass through untouched;
        # everything else is handed to the parent's error machinery.
        if code not in (200, 206):
            response = self.parent.error(
                'http', request, response, code, msg, hdrs)

        return response

    https_response = http_response
class HTTPDefaultErrorHandler(BaseHandler):
    """Last-resort handler: raise HTTPError for any unhandled HTTP error."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301/302/303/307 responses by re-issuing the request."""

    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
            or code in (301, 302, 303) and m == "POST"):
            # Strictly (according to RFC 2616), 301 or 302 in response
            # to a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we
            # do the same.
            # be conciliant with URIs containing a space
            newurl = newurl.replace(' ', '%20')
            # the new request is marked unverifiable: the user did not
            # explicitly ask for the redirected URL
            return Request(newurl,
                           headers=req.headers,
                           origin_req_host=req.get_origin_req_host(),
                           unverifiable=True)
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if 'location' in headers:
            newurl = headers.getheaders('location')[0]
        elif 'uri' in headers:
            newurl = headers.getheaders('uri')[0]
        else:
            return
        # the Location value may be relative; resolve against the
        # current URL
        newurl = urlparse.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, newurl)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
def _parse_proxy(proxy):
    """Return (scheme, user, password, host/port) given a URL or an authority.

    If a URL is supplied, it must have an authority (host:port) component.
    According to RFC 3986, having an authority component means the URL must
    have two slashes after the scheme:

    >>> _parse_proxy('file:/ftp.example.com/')
    Traceback (most recent call last):
    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'

    The first three items of the returned tuple may be None.

    Examples of authority parsing:

    >>> _parse_proxy('proxy.example.com')
    (None, None, None, 'proxy.example.com')
    >>> _parse_proxy('proxy.example.com:3128')
    (None, None, None, 'proxy.example.com:3128')

    The authority component may optionally include userinfo (assumed to be
    username:password):

    >>> _parse_proxy('joe:password@proxy.example.com')
    (None, 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('joe:password@proxy.example.com:3128')
    (None, 'joe', 'password', 'proxy.example.com:3128')

    Same examples, but with URLs instead:

    >>> _parse_proxy('http://proxy.example.com/')
    ('http', None, None, 'proxy.example.com')
    >>> _parse_proxy('http://proxy.example.com:3128/')
    ('http', None, None, 'proxy.example.com:3128')
    >>> _parse_proxy('http://joe:password@proxy.example.com/')
    ('http', 'joe', 'password', 'proxy.example.com')
    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
    ('http', 'joe', 'password', 'proxy.example.com:3128')

    Everything after the authority is ignored:

    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
    ('ftp', 'joe', 'password', 'proxy.example.com')

    Test for no trailing '/' case:

    >>> _parse_proxy('http://joe:password@proxy.example.com')
    ('http', 'joe', 'password', 'proxy.example.com')
    """
    scheme, rest = splittype(proxy)
    if not rest.startswith("/"):
        # a bare authority such as 'host:port' -- no scheme at all
        scheme = None
        authority = proxy
    else:
        # a full URL; RFC 3986 requires '//' before the authority
        if not rest.startswith("//"):
            raise ValueError("proxy URL with no authority: %r" % proxy)
        # For RFC 3986-compliant URLs (by ss 3.2. and 3.3.), the path is
        # empty or starts with '/', so the authority runs from just after
        # the '//' up to the next '/' (or to the end of the string).
        slash = rest.find("/", 2)
        authority = rest[2:slash] if slash != -1 else rest[2:]
    userinfo, hostport = splituser(authority)
    if userinfo is None:
        user = password = None
    else:
        user, password = splitpasswd(userinfo)
    return scheme, user, password, hostport
class ProxyHandler(BaseHandler):
    """Rewrite requests so they are sent through the configured proxies."""
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        # proxies: mapping of scheme -> proxy URL (defaults to the
        # environment-derived mapping from urllib.getproxies())
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        # create one <scheme>_open method per configured proxy; the lambda
        # default arguments bind the current url/type at definition time
        for type, url in proxies.items():
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at the proxy; reopen if the scheme changed."""
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)
        if proxy_type is None:
            proxy_type = orig_type
        if user and password:
            # proxy URL carried credentials: pre-emptively send Basic auth
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)
        if orig_type == proxy_type:
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req)
class HTTPPasswordMgr:
    """Store and look up (user, password) pairs keyed by realm and URI."""

    def __init__(self):
        # maps realm -> {reduced-URI-tuple -> (user, password)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        # uri could be a single URI or a sequence
        if isinstance(uri, basestring):
            uri = [uri]
        if not realm in self.passwd:
            self.passwd[realm] = {}
        # store under both the with- and without-default-port reductions
        # so either spelling of the URI matches at lookup time
        for default_port in True, False:
            reduced_uri = tuple(
                [self.reduce_uri(u, default_port) for u in uri])
            self.passwd[realm][reduced_uri] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return (user, password) for realm/authuri, or (None, None)."""
        domains = self.passwd.get(realm, {})
        for default_port in True, False:
            reduced_authuri = self.reduce_uri(authuri, default_port)
            for uris, authinfo in domains.iteritems():
                for uri in uris:
                    if self.is_suburi(uri, reduced_authuri):
                        return authinfo
        return None, None

    def reduce_uri(self, uri, default_port=True):
        """Accept authority or URI and extract only the authority and path."""
        # note HTTP URLs do not have a userinfo component
        parts = urlparse.urlsplit(uri)
        if parts[1]:
            # URI
            scheme = parts[0]
            authority = parts[1]
            path = parts[2] or '/'
        else:
            # host or host:port
            scheme = None
            authority = uri
            path = '/'
        host, port = splitport(authority)
        if default_port and port is None and scheme is not None:
            # normalise by appending the scheme's well-known default port
            dport = {"http": 80,
                     "https": 443,
                     }.get(scheme)
            if dport is not None:
                authority = "%s:%d" % (host, dport)
        return authority, path

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # NOTE(review): commonprefix is character-based, not
        # path-segment based, so '/foo' also "contains" '/foobar'.
        common = posixpath.commonprefix((base[1], test[1]))
        if len(common) == len(base[1]):
            return True
        return False
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to the default (None) realm."""

    def find_user_password(self, realm, authuri):
        # try the specific realm first, then the catch-all None realm
        user, password = HTTPPasswordMgr.find_user_password(self, realm,
                                                            authuri)
        if user is not None:
            return user, password
        return HTTPPasswordMgr.find_user_password(self, None, authuri)
class AbstractBasicAuthHandler:
    """Shared machinery for HTTP (401) and proxy (407) basic auth."""

    # XXX this allows for multiple auth-schemes, but will stupidly pick
    # the last one with a realm specified.
    rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX could pre-emptively send auth info already accepted (RFC 2617,
    # end of section 2, and section 1.2 immediately after "credentials"
    # production).

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # expose the manager's add_password directly on the handler
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        # host may be an authority (without userinfo) or a URL with an
        # authority
        # XXX could be multiple headers
        # *authreq* names the challenge header to look for
        authreq = headers.get(authreq, None)
        if authreq:
            mo = AbstractBasicAuthHandler.rx.search(authreq)
            if mo:
                scheme, realm = mo.groups()
                if scheme.lower() == 'basic':
                    return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is not None:
            raw = "%s:%s" % (user, pw)
            auth = 'Basic %s' % base64.b64encode(raw).strip()
            if req.headers.get(self.auth_header, None) == auth:
                # already tried these credentials: give up to avoid looping
                return None
            req.add_header(self.auth_header, auth)
            return self.parent.open(req)
        else:
            return None
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Basic authentication against the origin server (401 responses)."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        url = req.get_full_url()
        return self.http_error_auth_reqed('www-authenticate',
                                          url, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Basic authentication against a proxy (407 responses)."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # http_error_auth_reqed requires that there is no userinfo component in
        # authority.  Assume there isn't one, since urllib2 does not (and
        # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
        # userinfo.
        authority = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          authority, req, headers)
def randombytes(n):
    """Return n random bytes."""
    # Use /dev/urandom if it is available.  Fall back to random module
    # if not.  It might be worthwhile to extend this function to use
    # other platform-specific mechanisms for getting random bytes.
    if os.path.exists("/dev/urandom"):
        # Open in binary mode (the data is bytes, not text) and make
        # sure the file is closed even if the read fails.
        f = open("/dev/urandom", "rb")
        try:
            s = f.read(n)
        finally:
            f.close()
        return s
    else:
        L = [chr(random.randrange(0, 256)) for i in range(n)]
        return "".join(L)
class AbstractDigestAuthHandler:
    """Shared machinery for HTTP and proxy digest authentication."""
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0      # challenges answered for the current request
        self.nonce_count = 0  # 'nc' value, incremented per qop=auth request

    def reset_retry_count(self):
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """React to a 401/407 challenge carried in *auth_header*."""
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)

    def retry_http_digest_auth(self, req, auth):
        """Re-issue *req* carrying a Digest authorization header."""
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            if req.headers.get(self.auth_header, None) == auth_val:
                # same credentials we already sent: give up, don't loop
                return None
            req.add_unredirected_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
                                            randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest Authorization field value, or None on failure."""
        try:
            realm = chal['realm']
            nonce = chal['nonce']
            qop = chal.get('qop')
            algorithm = chal.get('algorithm', 'MD5')
            # mod_digest doesn't send an opaque, even though it isn't
            # supposed to be optional
            opaque = chal.get('opaque', None)
        except KeyError:
            return None

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            # algorithm not supported; let another handler try
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.get_method(),
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        elif qop is None:
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
        else:
            # XXX handle auth-int.
            raise URLError("qop '%s' is not supported." % qop)

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base += ', opaque="%s"' % opaque
        if entdig:
            base += ', digest="%s"' % entdig
        base += ', algorithm="%s"' % algorithm
        if qop:
            base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return (H, KD) digest callables, or (None, None) if unsupported.

        Previously an unrecognized algorithm (e.g. 'MD5-sess') fell
        through with H unbound and crashed with UnboundLocalError;
        get_authorization() already treats H is None as "decline".
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: hashlib.md5(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: hashlib.sha1(x).hexdigest()
        else:
            # XXX MD5-sess not implemented; signal "unsupported"
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'
    handler_order = 490  # before Basic auth

    def http_error_401(self, req, fp, code, msg, headers):
        # credentials are looked up against the URL's host component
        host = urlparse.urlparse(req.get_full_url())[1]
        retry = self.http_error_auth_reqed('www-authenticate',
                                           host, req, headers)
        self.reset_retry_count()
        return retry
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Digest authentication handler for proxies (HTTP 407 responses)."""

    auth_header = 'Proxy-Authorization'
    handler_order = 490  # before Basic auth

    def http_error_407(self, req, fp, code, msg, headers):
        # The request's own host (not the URL) identifies the proxy
        # demanding authentication.
        proxy_host = req.get_host()
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              proxy_host, req, headers)
        self.reset_retry_count()
        return response
class AbstractHTTPHandler(BaseHandler):
    """Shared machinery for HTTPHandler and HTTPSHandler: fills in
    default request headers (do_request_) and drives an httplib
    connection to produce an addinfourl response (do_open)."""

    def __init__(self, debuglevel=0):
        # Forwarded to the httplib connection's set_debuglevel in do_open.
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        """Add default headers to *request* in place and return it.

        Fills in Content-type/Content-length for POSTs, the Host header,
        and the opener's addheaders; all are added as "unredirected"
        headers so they are recomputed rather than copied on redirect.
        Raises URLError if the request has no host.
        """
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')
            if not request.has_header('Content-length'):
                request.add_unredirected_header(
                    'Content-length', '%d' % len(data))

        # Prefer the host embedded in the selector (full URL form, as
        # used with proxies) over the request's own host for Host:.
        scheme, sel = splittype(request.get_selector())
        sel_host, sel_path = splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            # addheaders are defaults only: never override headers the
            # caller set explicitly.
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        # Merge normal and unredirected headers; unredirected ones win.
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        # Header names are case-insensitive on the wire; title-case them
        # so entries differing only in case collapse to one.
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err: # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r, close=True)

        resp = addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        return resp
class HTTPHandler(AbstractHTTPHandler):
    """Opener handler for http:// URLs."""

    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    # OpenerDirector looks up <scheme>_request preprocessors by name;
    # reuse the shared header-filling logic for http requests.
    http_request = AbstractHTTPHandler.do_request_
# Only define HTTPSHandler when the Python build's httplib has SSL
# support (signalled by the presence of httplib.HTTPS).
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Opener handler for https:// URLs."""

        def https_open(self, req):
            return self.do_open(httplib.HTTPSConnection, req)

        # Same default-header preprocessing as plain HTTP.
        https_request = AbstractHTTPHandler.do_request_
class HTTPCookieProcessor(BaseHandler):
    """Handler that threads a cookielib.CookieJar through the opener.

    Outgoing requests get Cookie headers from the jar; incoming
    responses have their Set-Cookie headers stored into it.
    """

    def __init__(self, cookiejar=None):
        import cookielib
        jar = cookiejar
        if jar is None:
            # Default to a fresh, empty in-memory jar.
            jar = cookielib.CookieJar()
        self.cookiejar = jar

    def http_request(self, request):
        # Attach any stored cookies that match this request.
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        # Harvest Set-Cookie headers from the response into the jar.
        self.cookiejar.extract_cookies(response, request)
        return response

    # Cookie handling is identical over SSL.
    https_request = http_request
    https_response = http_response
class UnknownHandler(BaseHandler):
    """Fallback handler: any URL scheme nothing else claims is an error."""

    def unknown_open(self, req):
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Values may be surrounded by double quotes, which are stripped.
    Returns a dict mapping each key to its (unquoted) value.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Strip surrounding quotes only when the value is long enough to
        # actually have a pair of them: an empty value ('k=') used to
        # raise IndexError here, and a lone '"' was mangled to ''.
        if len(v) > 1 and v[0] == '"' and v[-1] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma.  A non-quoted string could have quotes in the
    middle.  Neither commas nor quotes count if they are escaped.

    Only double-quotes count, not single-quotes.
    """
    items = []
    buf = []                    # characters of the element being built
    inside_quotes = False
    pending_escape = False

    for ch in s:
        if pending_escape:
            # Previous char was a backslash inside quotes: keep this
            # char literally (the backslash itself is not kept).
            buf.append(ch)
            pending_escape = False
        elif inside_quotes:
            if ch == '\\':
                pending_escape = True
            else:
                if ch == '"':
                    inside_quotes = False
                buf.append(ch)
        elif ch == ',':
            # An unquoted comma terminates the current element.
            items.append(''.join(buf))
            buf = []
        else:
            if ch == '"':
                inside_quotes = True
            buf.append(ch)

    # Flush the trailing element, if any.
    if buf:
        items.append(''.join(buf))

    return [item.strip() for item in items]
class FileHandler(BaseHandler):
    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open a file: URL; selectors of the form //host/... with a
        non-empty host are re-dispatched as ftp, everything else is
        opened as a local file."""
        url = req.get_selector()
        if url[:2] == '//' and url[2:3] != '/':
            # //host/path names a remote host: hand the request back to
            # the opener as an FTP request.
            req.type = 'ftp'
            return self.parent.open(req)
        else:
            return self.open_local_file(req)

    # names for the localhost
    names = None
    def get_names(self):
        # Compute once and cache on the class: the addresses that count
        # as "this machine" for the host check in open_local_file.
        if FileHandler.names is None:
            try:
                FileHandler.names = (socket.gethostbyname('localhost'),
                                    socket.gethostbyname(socket.gethostname()))
            except socket.gaierror:
                # Hostname doesn't resolve; fall back to localhost only.
                FileHandler.names = (socket.gethostbyname('localhost'),)
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return an addinfourl for a local file, with synthesized
        Content-type/Content-length/Last-modified headers.

        Raises URLError if the URL names a host that is not this
        machine; raises os.error (uncaught here) if the file is missing.
        """
        import email.Utils
        import mimetypes
        host = req.get_host()
        file = req.get_selector()
        localfile = url2pathname(file)
        stats = os.stat(localfile)
        size = stats.st_size
        modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(file)[0]
        headers = mimetools.Message(StringIO(
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if host:
            host, port = splitport(host)
        # NOTE: when host is empty, 'port' is never bound -- safe only
        # because 'not host' short-circuits the expression below.
        if not host or \
            (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:'+file)
        raise URLError('file not on local host')
class FTPHandler(BaseHandler):
    def ftp_open(self, req):
        """Open an ftp: URL: connect (optionally with user:password
        parsed from the URL), fetch the file or directory listing, and
        return it wrapped as an addinfourl.

        Raises IOError('ftp error', ...) for FTP-level failures and
        URLError if the host does not resolve.
        """
        import ftplib
        import mimetypes
        host = req.get_host()
        if not host:
            raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        if port is None:
            port = ftplib.FTP_PORT
        else:
            port = int(port)

        # username/password handling: credentials may be embedded in
        # the URL as user:password@host, percent-encoded.
        user, host = splituser(host)
        if user:
            user, passwd = splitpasswd(user)
        else:
            passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')

        try:
            host = socket.gethostbyname(host)
        except socket.error, msg:
            raise URLError(msg)
        # Split ;type=... attributes off the path, then break the path
        # into directory components and a final filename.
        path, attrs = splitattr(req.get_selector())
        dirs = path.split('/')
        dirs = map(unquote, dirs)
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]:
            # Absolute path: drop the empty leading component.
            dirs = dirs[1:]
        try:
            fw = self.connect_ftp(user, passwd, host, port, dirs)
            # Default transfer type: 'I'mage (binary) for a file,
            # 'D'irectory listing when there is no filename.
            type = file and 'I' or 'D'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    # An explicit ;type= attribute overrides the default.
                    type = value.upper()
            fp, retrlen = fw.retrfile(file, type)
            headers = ""
            mtype = mimetypes.guess_type(req.get_full_url())[0]
            if mtype:
                headers += "Content-type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-length: %d\n" % retrlen
            sf = StringIO(headers)
            headers = mimetools.Message(sf)
            return addinfourl(fp, headers, req.get_full_url())
        except ftplib.all_errors, msg:
            # Re-raise as IOError but preserve the original traceback.
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def connect_ftp(self, user, passwd, host, port, dirs):
        # Hook point: CacheFTPHandler overrides this to reuse connections.
        fw = ftpwrapper(user, passwd, host, port, dirs)
##        fw.ftp.set_debuglevel(1)
        return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler that keeps a small cache of live FTP connections.

    Connections are keyed by (user, host, port, path); each expires
    self.delay seconds after last use, and at most self.max_conns are
    kept alive at once.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}      # key -> live ftpwrapper
        self.timeout = {}    # key -> absolute expiry time
        self.soonest = 0     # earliest expiry among cached connections
        self.delay = 60      # idle lifetime, in seconds
        self.max_conns = 16

    def setTimeout(self, t):
        """Set the idle lifetime (seconds) for cached connections."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the maximum number of simultaneously cached connections."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return a cached connection for this target, creating one and
        refreshing its expiry time as needed."""
        key = user, host, port, '/'.join(dirs)
        if key in self.cache:
            self.timeout[key] = time.time() + self.delay
        else:
            self.cache[key] = ftpwrapper(user, passwd, host, port, dirs)
            self.timeout[key] = time.time() + self.delay
        self.check_cache()
        return self.cache[key]

    def check_cache(self):
        """Evict expired connections, then enforce max_conns."""
        # first check for old ones
        t = time.time()
        if self.soonest <= t:
            # Iterate over a snapshot: entries are deleted inside the loop.
            for k, v in list(self.timeout.items()):
                if v < t:
                    self.cache[k].close()
                    del self.cache[k]
                    del self.timeout[k]
            # min() on an empty sequence raises ValueError; once every
            # connection has expired, reset soonest to 0 instead.
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0

        # then check the size
        if len(self.cache) == self.max_conns:
            for k, v in list(self.timeout.items()):
                if v == self.soonest:
                    del self.cache[k]
                    del self.timeout[k]
                    break
            if self.timeout:
                self.soonest = min(self.timeout.values())
            else:
                self.soonest = 0
class GopherHandler(BaseHandler):
    def gopher_open(self, req):
        """Open a gopher: URL and return the response as an addinfourl."""
        # XXX can raise socket.error
        import gopherlib  # this raises DeprecationWarning in 2.5
        host = req.get_host()
        if not host:
            raise GopherError('no host given')
        host = unquote(host)
        selector = req.get_selector()
        # Split off the gopher item type and any search query.
        # (The parsed item type is currently unused.)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), req.get_full_url())