Lib/urllib.py

   1 """Open an arbitrary URL.
   2
   3 See the following document for more info on URLs:
   4 "Names and Addresses, URIs, URLs, URNs, URCs", at
   5 http://www.w3.org/pub/WWW/Addressing/Overview.html
   6
   7 See also the HTTP spec (from which the error codes are derived):
   8 "HTTP - Hypertext Transfer Protocol", at
   9 http://www.w3.org/pub/WWW/Protocols/
  10
  11 Related standards and specs:
  12 - RFC1808: the "relative URL" spec. (authoritative status)
  13 - RFC1738 - the "URL standard". (authoritative status)
  14 - RFC1630 - the "URI spec". (informational status)
  15
  16 The object returned by URLopener().open(file) will differ per
  17 protocol.  All you know is that is has methods read(), readline(),
  18 readlines(), fileno(), close() and info().  The read*(), fileno()
  19 and close() methods work like those of open files.
  20 The info() method returns a mimetools.Message object which can be
  21 used to query various info about the object, if available.
  22 (mimetools.Message objects are queried with the getheader() method.)
  23 """
  24
  25 import string
  26 import socket
  27 import os
  28 import time
  29 import sys
  30 from urlparse import urljoin as basejoin
  31 import warnings
  32
  33 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
  34            "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
  35            "urlencode", "url2pathname", "pathname2url", "splittag",
  36            "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
  37            "splittype", "splithost", "splituser", "splitpasswd", "splitport",
  38            "splitnport", "splitquery", "splitattr", "splitvalue",
  39            "getproxies"]
  40
  41 __version__ = '1.17'    # XXX This version is not always updated :-(
  42
  43 MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
  44
  45 # Helper for non-unix systems
  46 if os.name == 'mac':
  47     from macurl2path import url2pathname, pathname2url
  48 elif os.name == 'nt':
  49     from nturl2path import url2pathname, pathname2url
  50 elif os.name == 'riscos':
  51     from rourl2path import url2pathname, pathname2url
  52 else:
  53     def url2pathname(pathname):
  54         """OS-specific conversion from a relative URL of the 'file' scheme
  55         to a file system path; not recommended for general use."""
  56         return unquote(pathname)
  57
  58     def pathname2url(pathname):
  59         """OS-specific conversion from a file system path to a relative URL
  60         of the 'file' scheme; not recommended for general use."""
  61         return quote(pathname)
  62
  63 # This really consists of two pieces:
  64 # (1) a class which handles opening of all sorts of URLs
  65 #     (plus assorted utilities etc.)
  66 # (2) a set of functions for parsing URLs
  67 # XXX Should these be separated out into different modules?
  68
  69
  70 # Shortcut for basic usage
  71 _urlopener = None
  72 def urlopen(url, data=None, proxies=None):
  73     """Create a file-like object for the specified URL to read from."""
  74     from warnings import warnpy3k
  75     warnings.warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
  76                         "favor of urllib2.urlopen()", stacklevel=2)
  77
  78     global _urlopener
  79     if proxies is not None:
  80         opener = FancyURLopener(proxies=proxies)
  81     elif not _urlopener:
  82         opener = FancyURLopener()
  83         _urlopener = opener
  84     else:
  85         opener = _urlopener
  86     if data is None:
  87         return opener.open(url)
  88     else:
  89         return opener.open(url, data)
  90 def urlretrieve(url, filename=None, reporthook=None, data=None):
  91     global _urlopener
  92     if not _urlopener:
  93         _urlopener = FancyURLopener()
  94     return _urlopener.retrieve(url, filename, reporthook, data)
  95 def urlcleanup():
  96     if _urlopener:
  97         _urlopener.cleanup()
  98
  99 # check for SSL
 100 try:
 101     import ssl
 102 except:
 103     _have_ssl = False
 104 else:
 105     _have_ssl = True
 106
 107 # exception raised when downloaded size does not match content-length
 108 class ContentTooShortError(IOError):
 109     def __init__(self, message, content):
 110         IOError.__init__(self, message)
 111         self.content = content
 112
 113 ftpcache = {}
 114 class URLopener:
 115     """Class to open URLs.
 116     This is a class rather than just a subroutine because we may need
 117     more than one set of global protocol-specific options.
 118     Note -- this is a base class for those who don't want the
 119     automatic handling of errors type 302 (relocated) and 401
 120     (authorization needed)."""
 121
 122     __tempfiles = None
 123
 124     version = "Python-urllib/%s" % __version__
 125
 126     # Constructor
 127     def __init__(self, proxies=None, **x509):
 128         if proxies is None:
 129             proxies = getproxies()
 130         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 131         self.proxies = proxies
 132         self.key_file = x509.get('key_file')
 133         self.cert_file = x509.get('cert_file')
 134         self.addheaders = [('User-Agent', self.version)]
 135         self.__tempfiles = []
 136         self.__unlink = os.unlink # See cleanup()
 137         self.tempcache = None
 138         # Undocumented feature: if you assign {} to tempcache,
 139         # it is used to cache files retrieved with
 140         # self.retrieve().  This is not enabled by default
 141         # since it does not work for changing documents (and I
 142         # haven't got the logic to check expiration headers
 143         # yet).
 144         self.ftpcache = ftpcache
 145         # Undocumented feature: you can use a different
 146         # ftp cache by assigning to the .ftpcache member;
 147         # in case you want logically independent URL openers
 148         # XXX This is not threadsafe.  Bah.
 149
 150     def __del__(self):
 151         self.close()
 152
 153     def close(self):
 154         self.cleanup()
 155
 156     def cleanup(self):
 157         # This code sometimes runs when the rest of this module
 158         # has already been deleted, so it can't use any globals
 159         # or import anything.
 160         if self.__tempfiles:
 161             for file in self.__tempfiles:
 162                 try:
 163                     self.__unlink(file)
 164                 except OSError:
 165                     pass
 166             del self.__tempfiles[:]
 167         if self.tempcache:
 168             self.tempcache.clear()
 169
 170     def addheader(self, *args):
 171         """Add a header to be used by the HTTP interface only
 172         e.g. u.addheader('Accept', 'sound/basic')"""
 173         self.addheaders.append(args)
 174
 175     # External interface
 176     def open(self, fullurl, data=None):
 177         """Use URLopener().open(file) instead of open(file, 'r')."""
 178         fullurl = unwrap(toBytes(fullurl))
 179         if self.tempcache and fullurl in self.tempcache:
 180             filename, headers = self.tempcache[fullurl]
 181             fp = open(filename, 'rb')
 182             return addinfourl(fp, headers, fullurl)
 183         urltype, url = splittype(fullurl)
 184         if not urltype:
 185             urltype = 'file'
 186         if urltype in self.proxies:
 187             proxy = self.proxies[urltype]
 188             urltype, proxyhost = splittype(proxy)
 189             host, selector = splithost(proxyhost)
 190             url = (host, fullurl) # Signal special case to open_*()
 191         else:
 192             proxy = None
 193         name = 'open_' + urltype
 194         self.type = urltype
 195         name = name.replace('-', '_')
 196         if not hasattr(self, name):
 197             if proxy:
 198                 return self.open_unknown_proxy(proxy, fullurl, data)
 199             else:
 200                 return self.open_unknown(fullurl, data)
 201         try:
 202             if data is None:
 203                 return getattr(self, name)(url)
 204             else:
 205                 return getattr(self, name)(url, data)
 206         except socket.error, msg:
 207             raise IOError, ('socket error', msg), sys.exc_info()[2]
 208
 209     def open_unknown(self, fullurl, data=None):
 210         """Overridable interface to open unknown URL type."""
 211         type, url = splittype(fullurl)
 212         raise IOError, ('url error', 'unknown url type', type)
 213
 214     def open_unknown_proxy(self, proxy, fullurl, data=None):
 215         """Overridable interface to open unknown URL type."""
 216         type, url = splittype(fullurl)
 217         raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
 218
 219     # External interface
 220     def retrieve(self, url, filename=None, reporthook=None, data=None):
 221         """retrieve(url) returns (filename, headers) for a local object
 222         or (tempfilename, headers) for a remote object."""
 223         url = unwrap(toBytes(url))
 224         if self.tempcache and url in self.tempcache:
 225             return self.tempcache[url]
 226         type, url1 = splittype(url)
 227         if filename is None and (not type or type == 'file'):
 228             try:
 229                 fp = self.open_local_file(url1)
 230                 hdrs = fp.info()
 231                 del fp
 232                 return url2pathname(splithost(url1)[1]), hdrs
 233             except IOError, msg:
 234                 pass
 235         fp = self.open(url, data)
 236         headers = fp.info()
 237         if filename:
 238             tfp = open(filename, 'wb')
 239         else:
 240             import tempfile
 241             garbage, path = splittype(url)
 242             garbage, path = splithost(path or "")
 243             path, garbage = splitquery(path or "")
 244             path, garbage = splitattr(path or "")
 245             suffix = os.path.splitext(path)[1]
 246             (fd, filename) = tempfile.mkstemp(suffix)
 247             self.__tempfiles.append(filename)
 248             tfp = os.fdopen(fd, 'wb')
 249         result = filename, headers
 250         if self.tempcache is not None:
 251             self.tempcache[url] = result
 252         bs = 1024*8
 253         size = -1
 254         read = 0
 255         blocknum = 0
 256         if reporthook:
 257             if "content-length" in headers:
 258                 size = int(headers["Content-Length"])
 259             reporthook(blocknum, bs, size)
 260         while 1:
 261             block = fp.read(bs)
 262             if block == "":
 263                 break
 264             read += len(block)
 265             tfp.write(block)
 266             blocknum += 1
 267             if reporthook:
 268                 reporthook(blocknum, bs, size)
 269         fp.close()
 270         tfp.close()
 271         del fp
 272         del tfp
 273
 274         # raise exception if actual size does not match content-length header
 275         if size >= 0 and read < size:
 276             raise ContentTooShortError("retrieval incomplete: got only %i out "
 277                                        "of %i bytes" % (read, size), result)
 278
 279         return result
 280
 281     # Each method named open_<type> knows how to open that type of URL
 282
 283     def open_http(self, url, data=None):
 284         """Use HTTP protocol."""
 285         import httplib
 286         user_passwd = None
 287         proxy_passwd= None
 288         if isinstance(url, str):
 289             host, selector = splithost(url)
 290             if host:
 291                 user_passwd, host = splituser(host)
 292                 host = unquote(host)
 293             realhost = host
 294         else:
 295             host, selector = url
 296             # check whether the proxy contains authorization information
 297             proxy_passwd, host = splituser(host)
 298             # now we proceed with the url we want to obtain
 299             urltype, rest = splittype(selector)
 300             url = rest
 301             user_passwd = None
 302             if urltype.lower() != 'http':
 303                 realhost = None
 304             else:
 305                 realhost, rest = splithost(rest)
 306                 if realhost:
 307                     user_passwd, realhost = splituser(realhost)
 308                 if user_passwd:
 309                     selector = "%s://%s%s" % (urltype, realhost, rest)
 310                 if proxy_bypass(realhost):
 311                     host = realhost
 312
 313             #print "proxy via http:", host, selector
 314         if not host: raise IOError, ('http error', 'no host given')
 315
 316         if proxy_passwd:
 317             import base64
 318             proxy_auth = base64.b64encode(proxy_passwd).strip()
 319         else:
 320             proxy_auth = None
 321
 322         if user_passwd:
 323             import base64
 324             auth = base64.b64encode(user_passwd).strip()
 325         else:
 326             auth = None
 327         h = httplib.HTTP(host)
 328         if data is not None:
 329             h.putrequest('POST', selector)
 330             h.putheader('Content-Type', 'application/x-www-form-urlencoded')
 331             h.putheader('Content-Length', '%d' % len(data))
 332         else:
 333             h.putrequest('GET', selector)
 334         if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
 335         if auth: h.putheader('Authorization', 'Basic %s' % auth)
 336         if realhost: h.putheader('Host', realhost)
 337         for args in self.addheaders: h.putheader(*args)
 338         h.endheaders()
 339         if data is not None:
 340             h.send(data)
 341         errcode, errmsg, headers = h.getreply()
 342         fp = h.getfile()
 343         if errcode == -1:
 344             if fp: fp.close()
 345             # something went wrong with the HTTP status line
 346             raise IOError, ('http protocol error', 0,
 347                             'got a bad status line', None)
 348         # According to RFC 2616, "2xx" code indicates that the client's
 349         # request was successfully received, understood, and accepted.
 350         if (200 <= errcode < 300):
 351             return addinfourl(fp, headers, "http:" + url, errcode)
 352         else:
 353             if data is None:
 354                 return self.http_error(url, fp, errcode, errmsg, headers)
 355             else:
 356                 return self.http_error(url, fp, errcode, errmsg, headers, data)
 357
 358     def http_error(self, url, fp, errcode, errmsg, headers, data=None):
 359         """Handle http errors.
 360         Derived class can override this, or provide specific handlers
 361         named http_error_DDD where DDD is the 3-digit error code."""
 362         # First check if there's a specific handler for this error
 363         name = 'http_error_%d' % errcode
 364         if hasattr(self, name):
 365             method = getattr(self, name)
 366             if data is None:
 367                 result = method(url, fp, errcode, errmsg, headers)
 368             else:
 369                 result = method(url, fp, errcode, errmsg, headers, data)
 370             if result: return result
 371         return self.http_error_default(url, fp, errcode, errmsg, headers)
 372
 373     def http_error_default(self, url, fp, errcode, errmsg, headers):
 374         """Default error handler: close the connection and raise IOError."""
 375         void = fp.read()
 376         fp.close()
 377         raise IOError, ('http error', errcode, errmsg, headers)
 378
 379     if _have_ssl:
 380         def open_https(self, url, data=None):
 381             """Use HTTPS protocol."""
 382
 383             import httplib
 384             user_passwd = None
 385             proxy_passwd = None
 386             if isinstance(url, str):
 387                 host, selector = splithost(url)
 388                 if host:
 389                     user_passwd, host = splituser(host)
 390                     host = unquote(host)
 391                 realhost = host
 392             else:
 393                 host, selector = url
 394                 # here, we determine, whether the proxy contains authorization information
 395                 proxy_passwd, host = splituser(host)
 396                 urltype, rest = splittype(selector)
 397                 url = rest
 398                 user_passwd = None
 399                 if urltype.lower() != 'https':
 400                     realhost = None
 401                 else:
 402                     realhost, rest = splithost(rest)
 403                     if realhost:
 404                         user_passwd, realhost = splituser(realhost)
 405                     if user_passwd:
 406                         selector = "%s://%s%s" % (urltype, realhost, rest)
 407                 #print "proxy via https:", host, selector
 408             if not host: raise IOError, ('https error', 'no host given')
 409             if proxy_passwd:
 410                 import base64
 411                 proxy_auth = base64.b64encode(proxy_passwd).strip()
 412             else:
 413                 proxy_auth = None
 414             if user_passwd:
 415                 import base64
 416                 auth = base64.b64encode(user_passwd).strip()
 417             else:
 418                 auth = None
 419             h = httplib.HTTPS(host, 0,
 420                               key_file=self.key_file,
 421                               cert_file=self.cert_file)
 422             if data is not None:
 423                 h.putrequest('POST', selector)
 424                 h.putheader('Content-Type',
 425                             'application/x-www-form-urlencoded')
 426                 h.putheader('Content-Length', '%d' % len(data))
 427             else:
 428                 h.putrequest('GET', selector)
 429             if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
 430             if auth: h.putheader('Authorization', 'Basic %s' % auth)
 431             if realhost: h.putheader('Host', realhost)
 432             for args in self.addheaders: h.putheader(*args)
 433             h.endheaders()
 434             if data is not None:
 435                 h.send(data)
 436             errcode, errmsg, headers = h.getreply()
 437             fp = h.getfile()
 438             if errcode == -1:
 439                 if fp: fp.close()
 440                 # something went wrong with the HTTP status line
 441                 raise IOError, ('http protocol error', 0,
 442                                 'got a bad status line', None)
 443             # According to RFC 2616, "2xx" code indicates that the client's
 444             # request was successfully received, understood, and accepted.
 445             if (200 <= errcode < 300):
 446                 return addinfourl(fp, headers, "https:" + url, errcode)
 447             else:
 448                 if data is None:
 449                     return self.http_error(url, fp, errcode, errmsg, headers)
 450                 else:
 451                     return self.http_error(url, fp, errcode, errmsg, headers,
 452                                            data)
 453
 454     def open_file(self, url):
 455         """Use local file or FTP depending on form of URL."""
 456         if not isinstance(url, str):
 457             raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
 458         if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
 459             return self.open_ftp(url)
 460         else:
 461             return self.open_local_file(url)
 462
 463     def open_local_file(self, url):
 464         """Use local file."""
 465         import mimetypes, mimetools, email.utils
 466         try:
 467             from cStringIO import StringIO
 468         except ImportError:
 469             from StringIO import StringIO
 470         host, file = splithost(url)
 471         localname = url2pathname(file)
 472         try:
 473             stats = os.stat(localname)
 474         except OSError, e:
 475             raise IOError(e.errno, e.strerror, e.filename)
 476         size = stats.st_size
 477         modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
 478         mtype = mimetypes.guess_type(url)[0]
 479         headers = mimetools.Message(StringIO(
 480             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
 481             (mtype or 'text/plain', size, modified)))
 482         if not host:
 483             urlfile = file
 484             if file[:1] == '/':
 485                 urlfile = 'file://' + file
 486             return addinfourl(open(localname, 'rb'),
 487                               headers, urlfile)
 488         host, port = splitport(host)
 489         if not port \
 490            and socket.gethostbyname(host) in (localhost(), thishost()):
 491             urlfile = file
 492             if file[:1] == '/':
 493                 urlfile = 'file://' + file
 494             return addinfourl(open(localname, 'rb'),
 495                               headers, urlfile)
 496         raise IOError, ('local file error', 'not on local host')
 497
 498     def open_ftp(self, url):
 499         """Use FTP protocol."""
 500         if not isinstance(url, str):
 501             raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
 502         import mimetypes, mimetools
 503         try:
 504             from cStringIO import StringIO
 505         except ImportError:
 506             from StringIO import StringIO
 507         host, path = splithost(url)
 508         if not host: raise IOError, ('ftp error', 'no host given')
 509         host, port = splitport(host)
 510         user, host = splituser(host)
 511         if user: user, passwd = splitpasswd(user)
 512         else: passwd = None
 513         host = unquote(host)
 514         user = unquote(user or '')
 515         passwd = unquote(passwd or '')
 516         host = socket.gethostbyname(host)
 517         if not port:
 518             import ftplib
 519             port = ftplib.FTP_PORT
 520         else:
 521             port = int(port)
 522         path, attrs = splitattr(path)
 523         path = unquote(path)
 524         dirs = path.split('/')
 525         dirs, file = dirs[:-1], dirs[-1]
 526         if dirs and not dirs[0]: dirs = dirs[1:]
 527         if dirs and not dirs[0]: dirs[0] = '/'
 528         key = user, host, port, '/'.join(dirs)
 529         # XXX thread unsafe!
 530         if len(self.ftpcache) > MAXFTPCACHE:
 531             # Prune the cache, rather arbitrarily
 532             for k in self.ftpcache.keys():
 533                 if k != key:
 534                     v = self.ftpcache[k]
 535                     del self.ftpcache[k]
 536                     v.close()
 537         try:
 538             if not key in self.ftpcache:
 539                 self.ftpcache[key] = \
 540                     ftpwrapper(user, passwd, host, port, dirs)
 541             if not file: type = 'D'
 542             else: type = 'I'
 543             for attr in attrs:
 544                 attr, value = splitvalue(attr)
 545                 if attr.lower() == 'type' and \
 546                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
 547                     type = value.upper()
 548             (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
 549             mtype = mimetypes.guess_type("ftp:" + url)[0]
 550             headers = ""
 551             if mtype:
 552                 headers += "Content-Type: %s\n" % mtype
 553             if retrlen is not None and retrlen >= 0:
 554                 headers += "Content-Length: %d\n" % retrlen
 555             headers = mimetools.Message(StringIO(headers))
 556             return addinfourl(fp, headers, "ftp:" + url)
 557         except ftperrors(), msg:
 558             raise IOError, ('ftp error', msg), sys.exc_info()[2]
 559
 560     def open_data(self, url, data=None):
 561         """Use "data" URL."""
 562         if not isinstance(url, str):
 563             raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
 564         # ignore POSTed data
 565         #
 566         # syntax of data URLs:
 567         # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 568         # mediatype := [ type "/" subtype ] *( ";" parameter )
 569         # data      := *urlchar
 570         # parameter := attribute "=" value
 571         import mimetools
 572         try:
 573             from cStringIO import StringIO
 574         except ImportError:
 575             from StringIO import StringIO
 576         try:
 577             [type, data] = url.split(',', 1)
 578         except ValueError:
 579             raise IOError, ('data error', 'bad data URL')
 580         if not type:
 581             type = 'text/plain;charset=US-ASCII'
 582         semi = type.rfind(';')
 583         if semi >= 0 and '=' not in type[semi:]:
 584             encoding = type[semi+1:]
 585             type = type[:semi]
 586         else:
 587             encoding = ''
 588         msg = []
 589         msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
 590                                             time.gmtime(time.time())))
 591         msg.append('Content-type: %s' % type)
 592         if encoding == 'base64':
 593             import base64
 594             data = base64.decodestring(data)
 595         else:
 596             data = unquote(data)
 597         msg.append('Content-Length: %d' % len(data))
 598         msg.append('')
 599         msg.append(data)
 600         msg = '\n'.join(msg)
 601         f = StringIO(msg)
 602         headers = mimetools.Message(f, 0)
 603         #f.fileno = None     # needed for addinfourl
 604         return addinfourl(f, headers, url)
 605
 606
 607 class FancyURLopener(URLopener):
 608     """Derived class with handlers for errors we can handle (perhaps)."""
 609
 610     def __init__(self, *args, **kwargs):
 611         URLopener.__init__(self, *args, **kwargs)
 612         self.auth_cache = {}
 613         self.tries = 0
 614         self.maxtries = 10
 615
 616     def http_error_default(self, url, fp, errcode, errmsg, headers):
 617         """Default error handling -- don't raise an exception."""
 618         return addinfourl(fp, headers, "http:" + url, errcode)
 619
 620     def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
 621         """Error 302 -- relocated (temporarily)."""
 622         self.tries += 1
 623         if self.maxtries and self.tries >= self.maxtries:
 624             if hasattr(self, "http_error_500"):
 625                 meth = self.http_error_500
 626             else:
 627                 meth = self.http_error_default
 628             self.tries = 0
 629             return meth(url, fp, 500,
 630                         "Internal Server Error: Redirect Recursion", headers)
 631         result = self.redirect_internal(url, fp, errcode, errmsg, headers,
 632                                         data)
 633         self.tries = 0
 634         return result
 635
 636     def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
 637         if 'location' in headers:
 638             newurl = headers['location']
 639         elif 'uri' in headers:
 640             newurl = headers['uri']
 641         else:
 642             return
 643         void = fp.read()
 644         fp.close()
 645         # In case the server sent a relative URL, join with original:
 646         newurl = basejoin(self.type + ":" + url, newurl)
 647         return self.open(newurl)
 648
 649     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
 650         """Error 301 -- also relocated (permanently)."""
 651         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 652
 653     def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
 654         """Error 303 -- also relocated (essentially identical to 302)."""
 655         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 656
 657     def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
 658         """Error 307 -- relocated, but turn POST into error."""
 659         if data is None:
 660             return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 661         else:
 662             return self.http_error_default(url, fp, errcode, errmsg, headers)
 663
 664     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
 665         """Error 401 -- authentication required.
 666         This function supports Basic authentication only."""
 667         if not 'www-authenticate' in headers:
 668             URLopener.http_error_default(self, url, fp,
 669                                          errcode, errmsg, headers)
 670         stuff = headers['www-authenticate']
 671         import re
 672         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 673         if not match:
 674             URLopener.http_error_default(self, url, fp,
 675                                          errcode, errmsg, headers)
 676         scheme, realm = match.groups()
 677         if scheme.lower() != 'basic':
 678             URLopener.http_error_default(self, url, fp,
 679                                          errcode, errmsg, headers)
 680         name = 'retry_' + self.type + '_basic_auth'
 681         if data is None:
 682             return getattr(self,name)(url, realm)
 683         else:
 684             return getattr(self,name)(url, realm, data)
 685
 686     def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
 687         """Error 407 -- proxy authentication required.
 688         This function supports Basic authentication only."""
 689         if not 'proxy-authenticate' in headers:
 690             URLopener.http_error_default(self, url, fp,
 691                                          errcode, errmsg, headers)
 692         stuff = headers['proxy-authenticate']
 693         import re
 694         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 695         if not match:
 696             URLopener.http_error_default(self, url, fp,
 697                                          errcode, errmsg, headers)
 698         scheme, realm = match.groups()
 699         if scheme.lower() != 'basic':
 700             URLopener.http_error_default(self, url, fp,
 701                                          errcode, errmsg, headers)
 702         name = 'retry_proxy_' + self.type + '_basic_auth'
 703         if data is None:
 704             return getattr(self,name)(url, realm)
 705         else:
 706             return getattr(self,name)(url, realm, data)
 707
 708     def retry_proxy_http_basic_auth(self, url, realm, data=None):
 709         host, selector = splithost(url)
 710         newurl = 'http://' + host + selector
 711         proxy = self.proxies['http']
 712         urltype, proxyhost = splittype(proxy)
 713         proxyhost, proxyselector = splithost(proxyhost)
 714         i = proxyhost.find('@') + 1
 715         proxyhost = proxyhost[i:]
 716         user, passwd = self.get_user_passwd(proxyhost, realm, i)
 717         if not (user or passwd): return None
 718         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
 719         self.proxies['http'] = 'http://' + proxyhost + proxyselector
 720         if data is None:
 721             return self.open(newurl)
 722         else:
 723             return self.open(newurl, data)
 724
 725     def retry_proxy_https_basic_auth(self, url, realm, data=None):
 726         host, selector = splithost(url)
 727         newurl = 'https://' + host + selector
 728         proxy = self.proxies['https']
 729         urltype, proxyhost = splittype(proxy)
 730         proxyhost, proxyselector = splithost(proxyhost)
 731         i = proxyhost.find('@') + 1
 732         proxyhost = proxyhost[i:]
 733         user, passwd = self.get_user_passwd(proxyhost, realm, i)
 734         if not (user or passwd): return None
 735         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
 736         self.proxies['https'] = 'https://' + proxyhost + proxyselector
 737         if data is None:
 738             return self.open(newurl)
 739         else:
 740             return self.open(newurl, data)
 741
 742     def retry_http_basic_auth(self, url, realm, data=None):
 743         host, selector = splithost(url)
 744         i = host.find('@') + 1
 745         host = host[i:]
 746         user, passwd = self.get_user_passwd(host, realm, i)
 747         if not (user or passwd): return None
 748         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
 749         newurl = 'http://' + host + selector
 750         if data is None:
 751             return self.open(newurl)
 752         else:
 753             return self.open(newurl, data)
 754
 755     def retry_https_basic_auth(self, url, realm, data=None):
 756         host, selector = splithost(url)
 757         i = host.find('@') + 1
 758         host = host[i:]
 759         user, passwd = self.get_user_passwd(host, realm, i)
 760         if not (user or passwd): return None
 761         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
 762         newurl = 'https://' + host + selector
 763         if data is None:
 764             return self.open(newurl)
 765         else:
 766             return self.open(newurl, data)
 767
 768     def get_user_passwd(self, host, realm, clear_cache = 0):
 769         key = realm + '@' + host.lower()
 770         if key in self.auth_cache:
 771             if clear_cache:
 772                 del self.auth_cache[key]
 773             else:
 774                 return self.auth_cache[key]
 775         user, passwd = self.prompt_user_passwd(host, realm)
 776         if user or passwd: self.auth_cache[key] = (user, passwd)
 777         return user, passwd
 778
 779     def prompt_user_passwd(self, host, realm):
 780         """Override this in a GUI environment!"""
 781         import getpass
 782         try:
 783             user = raw_input("Enter username for %s at %s: " % (realm,
 784                                                                 host))
 785             passwd = getpass.getpass("Enter password for %s in %s at %s: " %
 786                 (user, realm, host))
 787             return user, passwd
 788         except KeyboardInterrupt:
 789             print
 790             return None, None
 791
 792
 793 # Utility functions
 794
 795 _localhost = None
 796 def localhost():
 797     """Return the IP address of the magic hostname 'localhost'."""
 798     global _localhost
 799     if _localhost is None:
 800         _localhost = socket.gethostbyname('localhost')
 801     return _localhost
 802
 803 _thishost = None
 804 def thishost():
 805     """Return the IP address of the current host."""
 806     global _thishost
 807     if _thishost is None:
 808         _thishost = socket.gethostbyname(socket.gethostname())
 809     return _thishost
 810
 811 _ftperrors = None
 812 def ftperrors():
 813     """Return the set of errors raised by the FTP class."""
 814     global _ftperrors
 815     if _ftperrors is None:
 816         import ftplib
 817         _ftperrors = ftplib.all_errors
 818     return _ftperrors
 819
 820 _noheaders = None
 821 def noheaders():
 822     """Return an empty mimetools.Message object."""
 823     global _noheaders
 824     if _noheaders is None:
 825         import mimetools
 826         try:
 827             from cStringIO import StringIO
 828         except ImportError:
 829             from StringIO import StringIO
 830         _noheaders = mimetools.Message(StringIO(), 0)
 831         _noheaders.fp.close()   # Recycle file descriptor
 832     return _noheaders
 833
 834
 835 # Utility classes
 836
 837 class ftpwrapper:
 838     """Class used by open_ftp() for cache of open FTP connections."""
 839
 840     def __init__(self, user, passwd, host, port, dirs,
 841                  timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
 842         self.user = user
 843         self.passwd = passwd
 844         self.host = host
 845         self.port = port
 846         self.dirs = dirs
 847         self.timeout = timeout
 848         self.init()
 849
 850     def init(self):
 851         import ftplib
 852         self.busy = 0
 853         self.ftp = ftplib.FTP()
 854         self.ftp.connect(self.host, self.port, self.timeout)
 855         self.ftp.login(self.user, self.passwd)
 856         for dir in self.dirs:
 857             self.ftp.cwd(dir)
 858
 859     def retrfile(self, file, type):
 860         import ftplib
 861         self.endtransfer()
 862         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
 863         else: cmd = 'TYPE ' + type; isdir = 0
 864         try:
 865             self.ftp.voidcmd(cmd)
 866         except ftplib.all_errors:
 867             self.init()
 868             self.ftp.voidcmd(cmd)
 869         conn = None
 870         if file and not isdir:
 871             # Try to retrieve as a file
 872             try:
 873                 cmd = 'RETR ' + file
 874                 conn = self.ftp.ntransfercmd(cmd)
 875             except ftplib.error_perm, reason:
 876                 if str(reason)[:3] != '550':
 877                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
 878         if not conn:
 879             # Set transfer mode to ASCII!
 880             self.ftp.voidcmd('TYPE A')
 881             # Try a directory listing. Verify that directory exists.
 882             if file:
 883                 pwd = self.ftp.pwd()
 884                 try:
 885                     try:
 886                         self.ftp.cwd(file)
 887                     except ftplib.error_perm, reason:
 888                         raise IOError, ('ftp error', reason), sys.exc_info()[2]
 889                 finally:
 890                     self.ftp.cwd(pwd)
 891                 cmd = 'LIST ' + file
 892             else:
 893                 cmd = 'LIST'
 894             conn = self.ftp.ntransfercmd(cmd)
 895         self.busy = 1
 896         # Pass back both a suitably decorated object and a retrieval length
 897         return (addclosehook(conn[0].makefile('rb'),
 898                              self.endtransfer), conn[1])
 899     def endtransfer(self):
 900         if not self.busy:
 901             return
 902         self.busy = 0
 903         try:
 904             self.ftp.voidresp()
 905         except ftperrors():
 906             pass
 907
 908     def close(self):
 909         self.endtransfer()
 910         try:
 911             self.ftp.close()
 912         except ftperrors():
 913             pass
 914
 915 class addbase:
 916     """Base class for addinfo and addclosehook."""
 917
 918     def __init__(self, fp):
 919         self.fp = fp
 920         self.read = self.fp.read
 921         self.readline = self.fp.readline
 922         if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
 923         if hasattr(self.fp, "fileno"):
 924             self.fileno = self.fp.fileno
 925         else:
 926             self.fileno = lambda: None
 927         if hasattr(self.fp, "__iter__"):
 928             self.__iter__ = self.fp.__iter__
 929             if hasattr(self.fp, "next"):
 930                 self.next = self.fp.next
 931
 932     def __repr__(self):
 933         return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
 934                                              id(self), self.fp)
 935
 936     def close(self):
 937         self.read = None
 938         self.readline = None
 939         self.readlines = None
 940         self.fileno = None
 941         if self.fp: self.fp.close()
 942         self.fp = None
 943
 944 class addclosehook(addbase):
 945     """Class to add a close hook to an open file."""
 946
 947     def __init__(self, fp, closehook, *hookargs):
 948         addbase.__init__(self, fp)
 949         self.closehook = closehook
 950         self.hookargs = hookargs
 951
 952     def close(self):
 953         addbase.close(self)
 954         if self.closehook:
 955             self.closehook(*self.hookargs)
 956             self.closehook = None
 957             self.hookargs = None
 958
 959 class addinfo(addbase):
 960     """class to add an info() method to an open file."""
 961
 962     def __init__(self, fp, headers):
 963         addbase.__init__(self, fp)
 964         self.headers = headers
 965
 966     def info(self):
 967         return self.headers
 968
 969 class addinfourl(addbase):
 970     """class to add info() and geturl() methods to an open file."""
 971
 972     def __init__(self, fp, headers, url, code=None):
 973         addbase.__init__(self, fp)
 974         self.headers = headers
 975         self.url = url
 976         self.code = code
 977
 978     def info(self):
 979         return self.headers
 980
 981     def getcode(self):
 982         return self.code
 983
 984     def geturl(self):
 985         return self.url
 986
 987
 988 # Utilities to parse URLs (most of these return None for missing parts):
 989 # unwrap('<URL:type://host/path>') --> 'type://host/path'
 990 # splittype('type:opaquestring') --> 'type', 'opaquestring'
 991 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
 992 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
 993 # splitpasswd('user:passwd') -> 'user', 'passwd'
 994 # splitport('host:port') --> 'host', 'port'
 995 # splitquery('/path?query') --> '/path', 'query'
 996 # splittag('/path#tag') --> '/path', 'tag'
 997 # splitattr('/path;attr1=value1;attr2=value2;...') ->
 998 #   '/path', ['attr1=value1', 'attr2=value2', ...]
 999 # splitvalue('attr=value') --> 'attr', 'value'
1000 # unquote('abc%20def') -> 'abc def'
1001 # quote('abc def') -> 'abc%20def')
1002
1003 try:
1004     unicode
1005 except NameError:
1006     def _is_unicode(x):
1007         return 0
1008 else:
1009     def _is_unicode(x):
1010         return isinstance(x, unicode)
1011
1012 def toBytes(url):
1013     """toBytes(u"URL") --> 'URL'."""
1014     # Most URL schemes require ASCII. If that changes, the conversion
1015     # can be relaxed
1016     if _is_unicode(url):
1017         try:
1018             url = url.encode("ASCII")
1019         except UnicodeError:
1020             raise UnicodeError("URL " + repr(url) +
1021                                " contains non-ASCII characters")
1022     return url
1023
1024 def unwrap(url):
1025     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1026     url = url.strip()
1027     if url[:1] == '<' and url[-1:] == '>':
1028         url = url[1:-1].strip()
1029     if url[:4] == 'URL:': url = url[4:].strip()
1030     return url
1031
1032 _typeprog = None
1033 def splittype(url):
1034     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1035     global _typeprog
1036     if _typeprog is None:
1037         import re
1038         _typeprog = re.compile('^([^/:]+):')
1039
1040     match = _typeprog.match(url)
1041     if match:
1042         scheme = match.group(1)
1043         return scheme.lower(), url[len(scheme) + 1:]
1044     return None, url
1045
1046 _hostprog = None
1047 def splithost(url):
1048     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1049     global _hostprog
1050     if _hostprog is None:
1051         import re
1052         _hostprog = re.compile('^//([^/?]*)(.*)$')
1053
1054     match = _hostprog.match(url)
1055     if match: return match.group(1, 2)
1056     return None, url
1057
1058 _userprog = None
1059 def splituser(host):
1060     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1061     global _userprog
1062     if _userprog is None:
1063         import re
1064         _userprog = re.compile('^(.*)@(.*)$')
1065
1066     match = _userprog.match(host)
1067     if match: return map(unquote, match.group(1, 2))
1068     return None, host
1069
1070 _passwdprog = None
1071 def splitpasswd(user):
1072     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1073     global _passwdprog
1074     if _passwdprog is None:
1075         import re
1076         _passwdprog = re.compile('^([^:]*):(.*)$')
1077
1078     match = _passwdprog.match(user)
1079     if match: return match.group(1, 2)
1080     return user, None
1081
1082 # splittag('/path#tag') --> '/path', 'tag'
1083 _portprog = None
1084 def splitport(host):
1085     """splitport('host:port') --> 'host', 'port'."""
1086     global _portprog
1087     if _portprog is None:
1088         import re
1089         _portprog = re.compile('^(.*):([0-9]+)$')
1090
1091     match = _portprog.match(host)
1092     if match: return match.group(1, 2)
1093     return host, None
1094
1095 _nportprog = None
1096 def splitnport(host, defport=-1):
1097     """Split host and port, returning numeric port.
1098     Return given default port if no ':' found; defaults to -1.
1099     Return numerical port if a valid number are found after ':'.
1100     Return None if ':' but not a valid number."""
1101     global _nportprog
1102     if _nportprog is None:
1103         import re
1104         _nportprog = re.compile('^(.*):(.*)$')
1105
1106     match = _nportprog.match(host)
1107     if match:
1108         host, port = match.group(1, 2)
1109         try:
1110             if not port: raise ValueError, "no digits"
1111             nport = int(port)
1112         except ValueError:
1113             nport = None
1114         return host, nport
1115     return host, defport
1116
1117 _queryprog = None
1118 def splitquery(url):
1119     """splitquery('/path?query') --> '/path', 'query'."""
1120     global _queryprog
1121     if _queryprog is None:
1122         import re
1123         _queryprog = re.compile('^(.*)\?([^?]*)$')
1124
1125     match = _queryprog.match(url)
1126     if match: return match.group(1, 2)
1127     return url, None
1128
1129 _tagprog = None
1130 def splittag(url):
1131     """splittag('/path#tag') --> '/path', 'tag'."""
1132     global _tagprog
1133     if _tagprog is None:
1134         import re
1135         _tagprog = re.compile('^(.*)#([^#]*)$')
1136
1137     match = _tagprog.match(url)
1138     if match: return match.group(1, 2)
1139     return url, None
1140
1141 def splitattr(url):
1142     """splitattr('/path;attr1=value1;attr2=value2;...') ->
1143         '/path', ['attr1=value1', 'attr2=value2', ...]."""
1144     words = url.split(';')
1145     return words[0], words[1:]
1146
1147 _valueprog = None
1148 def splitvalue(attr):
1149     """splitvalue('attr=value') --> 'attr', 'value'."""
1150     global _valueprog
1151     if _valueprog is None:
1152         import re
1153         _valueprog = re.compile('^([^=]*)=(.*)$')
1154
1155     match = _valueprog.match(attr)
1156     if match: return match.group(1, 2)
1157     return attr, None
1158
1159 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
1160 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
1161
1162 def unquote(s):
1163     """unquote('abc%20def') -> 'abc def'."""
1164     res = s.split('%')
1165     for i in xrange(1, len(res)):
1166         item = res[i]
1167         try:
1168             res[i] = _hextochr[item[:2]] + item[2:]
1169         except KeyError:
1170             res[i] = '%' + item
1171         except UnicodeDecodeError:
1172             res[i] = unichr(int(item[:2], 16)) + item[2:]
1173     return "".join(res)
1174
1175 def unquote_plus(s):
1176     """unquote('%7e/abc+def') -> '~/abc def'"""
1177     s = s.replace('+', ' ')
1178     return unquote(s)
1179
1180 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1181                'abcdefghijklmnopqrstuvwxyz'
1182                '0123456789' '_.-')
1183 _safemaps = {}
1184
1185 def quote(s, safe = '/'):
1186     """quote('abc def') -> 'abc%20def'
1187
1188     Each part of a URL, e.g. the path info, the query, etc., has a
1189     different set of reserved characters that must be quoted.
1190
1191     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1192     the following reserved characters.
1193
1194     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1195                   "$" | ","
1196
1197     Each of these characters is reserved in some component of a URL,
1198     but not necessarily in all of them.
1199
1200     By default, the quote function is intended for quoting the path
1201     section of a URL.  Thus, it will not encode '/'.  This character
1202     is reserved, but in typical usage the quote function is being
1203     called on a path where the existing slash characters are used as
1204     reserved characters.
1205     """
1206     cachekey = (safe, always_safe)
1207     try:
1208         safe_map = _safemaps[cachekey]
1209     except KeyError:
1210         safe += always_safe
1211         safe_map = {}
1212         for i in range(256):
1213             c = chr(i)
1214             safe_map[c] = (c in safe) and c or ('%%%02X' % i)
1215         _safemaps[cachekey] = safe_map
1216     res = map(safe_map.__getitem__, s)
1217     return ''.join(res)
1218
1219 def quote_plus(s, safe = ''):
1220     """Quote the query fragment of a URL; replacing ' ' with '+'"""
1221     if ' ' in s:
1222         s = quote(s, safe + ' ')
1223         return s.replace(' ', '+')
1224     return quote(s, safe)
1225
1226 def urlencode(query,doseq=0):
1227     """Encode a sequence of two-element tuples or dictionary into a URL query string.
1228
1229     If any values in the query arg are sequences and doseq is true, each
1230     sequence element is converted to a separate parameter.
1231
1232     If the query arg is a sequence of two-element tuples, the order of the
1233     parameters in the output will match the order of parameters in the
1234     input.
1235     """
1236
1237     if hasattr(query,"items"):
1238         # mapping objects
1239         query = query.items()
1240     else:
1241         # it's a bother at times that strings and string-like objects are
1242         # sequences...
1243         try:
1244             # non-sequence items should not work with len()
1245             # non-empty strings will fail this
1246             if len(query) and not isinstance(query[0], tuple):
1247                 raise TypeError
1248             # zero-length sequences of all types will get here and succeed,
1249             # but that's a minor nit - since the original implementation
1250             # allowed empty dicts that type of behavior probably should be
1251             # preserved for consistency
1252         except TypeError:
1253             ty,va,tb = sys.exc_info()
1254             raise TypeError, "not a valid non-string sequence or mapping object", tb
1255
1256     l = []
1257     if not doseq:
1258         # preserve old behavior
1259         for k, v in query:
1260             k = quote_plus(str(k))
1261             v = quote_plus(str(v))
1262             l.append(k + '=' + v)
1263     else:
1264         for k, v in query:
1265             k = quote_plus(str(k))
1266             if isinstance(v, str):
1267                 v = quote_plus(v)
1268                 l.append(k + '=' + v)
1269             elif _is_unicode(v):
1270                 # is there a reasonable way to convert to ASCII?
1271                 # encode generates a string, but "replace" or "ignore"
1272                 # lose information and "strict" can raise UnicodeError
1273                 v = quote_plus(v.encode("ASCII","replace"))
1274                 l.append(k + '=' + v)
1275             else:
1276                 try:
1277                     # is this a sufficient test for sequence-ness?
1278                     x = len(v)
1279                 except TypeError:
1280                     # not a sequence
1281                     v = quote_plus(str(v))
1282                     l.append(k + '=' + v)
1283                 else:
1284                     # loop over the sequence
1285                     for elt in v:
1286                         l.append(k + '=' + quote_plus(str(elt)))
1287     return '&'.join(l)
1288
1289 # Proxy handling
1290 def getproxies_environment():
1291     """Return a dictionary of scheme -> proxy server URL mappings.
1292
1293     Scan the environment for variables named <scheme>_proxy;
1294     this seems to be the standard convention.  If you need a
1295     different way, you can pass a proxies dictionary to the
1296     [Fancy]URLopener constructor.
1297
1298     """
1299     proxies = {}
1300     for name, value in os.environ.items():
1301         name = name.lower()
1302         if name == 'no_proxy':
1303             # handled in proxy_bypass_environment
1304             continue
1305         if value and name[-6:] == '_proxy':
1306             proxies[name[:-6]] = value
1307     return proxies
1308
1309 def proxy_bypass_environment(host):
1310     """Test if proxies should not be used for a particular host.
1311
1312     Checks the environment for a variable named no_proxy, which should
1313     be a list of DNS suffixes separated by commas, or '*' for all hosts.
1314     """
1315     no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
1316     # '*' is special case for always bypass
1317     if no_proxy == '*':
1318         return 1
1319     # strip port off host
1320     hostonly, port = splitport(host)
1321     # check if the host ends with any of the DNS suffixes
1322     for name in no_proxy.split(','):
1323         if name and (hostonly.endswith(name) or host.endswith(name)):
1324             return 1
1325     # otherwise, don't bypass
1326     return 0
1327
1328
1329 if sys.platform == 'darwin':
1330
1331     def _CFSetup(sc):
1332         from ctypes import c_int32, c_void_p, c_char_p, c_int
1333         sc.CFStringCreateWithCString.argtypes = [ c_void_p, c_char_p, c_int32 ]
1334         sc.CFStringCreateWithCString.restype = c_void_p
1335         sc.SCDynamicStoreCopyProxies.argtypes = [ c_void_p ]
1336         sc.SCDynamicStoreCopyProxies.restype = c_void_p
1337         sc.CFDictionaryGetValue.argtypes = [ c_void_p, c_void_p ]
1338         sc.CFDictionaryGetValue.restype = c_void_p
1339         sc.CFStringGetLength.argtypes = [ c_void_p ]
1340         sc.CFStringGetLength.restype = c_int32
1341         sc.CFStringGetCString.argtypes = [ c_void_p, c_char_p, c_int32, c_int32 ]
1342         sc.CFStringGetCString.restype = c_int32
1343         sc.CFNumberGetValue.argtypes = [ c_void_p, c_int, c_void_p ]
1344         sc.CFNumberGetValue.restype = c_int32
1345         sc.CFRelease.argtypes = [ c_void_p ]
1346         sc.CFRelease.restype = None
1347
1348     def _CStringFromCFString(sc, value):
1349         from ctypes import create_string_buffer
1350         length = sc.CFStringGetLength(value) + 1
1351         buff = create_string_buffer(length)
1352         sc.CFStringGetCString(value, buff, length, 0)
1353         return buff.value
1354
1355     def _CFNumberToInt32(sc, cfnum):
1356         from ctypes import byref, c_int
1357         val = c_int()
1358         kCFNumberSInt32Type = 3
1359         sc.CFNumberGetValue(cfnum, kCFNumberSInt32Type, byref(val))
1360         return val.value
1361
1362
1363     def proxy_bypass_macosx_sysconf(host):
1364         """
1365         Return True iff this host shouldn't be accessed using a proxy
1366
1367         This function uses the MacOSX framework SystemConfiguration
1368         to fetch the proxy information.
1369         """
1370         from ctypes import cdll
1371         from ctypes.util import find_library
1372         import re
1373         import socket
1374         from fnmatch import fnmatch
1375
1376         def ip2num(ipAddr):
1377             parts = ipAddr.split('.')
1378             parts = map(int, parts)
1379             if len(parts) != 4:
1380                 parts = (parts + [0, 0, 0, 0])[:4]
1381             return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
1382
1383         sc = cdll.LoadLibrary(find_library("SystemConfiguration"))
1384         _CFSetup(sc)
1385
1386         hostIP = None
1387
1388         if not sc:
1389             return False
1390
1391         kSCPropNetProxiesExceptionsList = sc.CFStringCreateWithCString(0, "ExceptionsList", 0)
1392         kSCPropNetProxiesExcludeSimpleHostnames = sc.CFStringCreateWithCString(0,
1393                 "ExcludeSimpleHostnames", 0)
1394
1395
1396         proxyDict = sc.SCDynamicStoreCopyProxies(None)
1397         if proxyDict is None:
1398             return False
1399
1400         try:
1401             # Check for simple host names:
1402             if '.' not in host:
1403                 exclude_simple = sc.CFDictionaryGetValue(proxyDict,
1404                         kSCPropNetProxiesExcludeSimpleHostnames)
1405                 if exclude_simple and _CFNumberToInt32(sc, exclude_simple):
1406                     return True
1407
1408
1409             # Check the exceptions list:
1410             exceptions = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesExceptionsList)
1411             if exceptions:
1412                 # Items in the list are strings like these: *.local, 169.254/16
1413                 for index in xrange(sc.CFArrayGetCount(exceptions)):
1414                     value = sc.CFArrayGetValueAtIndex(exceptions, index)
1415                     if not value: continue
1416                     value = _CStringFromCFString(sc, value)
1417
1418                     m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
1419                     if m is not None:
1420                         if hostIP is None:
1421                             hostIP = socket.gethostbyname(host)
1422                             hostIP = ip2num(hostIP)
1423
1424                         base = ip2num(m.group(1))
1425                         mask = int(m.group(2)[1:])
1426                         mask = 32 - mask
1427
1428                         if (hostIP >> mask) == (base >> mask):
1429                             return True
1430
1431                     elif fnmatch(host, value):
1432                         return True
1433
1434             return False
1435
1436         finally:
1437             sc.CFRelease(kSCPropNetProxiesExceptionsList)
1438             sc.CFRelease(kSCPropNetProxiesExcludeSimpleHostnames)
1439
1440
1441
1442     def getproxies_macosx_sysconf():
1443         """Return a dictionary of scheme -> proxy server URL mappings.
1444
1445         This function uses the MacOSX framework SystemConfiguration
1446         to fetch the proxy information.
1447         """
1448         from ctypes import cdll
1449         from ctypes.util import find_library
1450
1451         sc = cdll.LoadLibrary(find_library("SystemConfiguration"))
1452         _CFSetup(sc)
1453
1454         if not sc:
1455             return {}
1456
1457         kSCPropNetProxiesHTTPEnable = sc.CFStringCreateWithCString(0, "HTTPEnable", 0)
1458         kSCPropNetProxiesHTTPProxy = sc.CFStringCreateWithCString(0, "HTTPProxy", 0)
1459         kSCPropNetProxiesHTTPPort = sc.CFStringCreateWithCString(0, "HTTPPort", 0)
1460
1461         kSCPropNetProxiesHTTPSEnable = sc.CFStringCreateWithCString(0, "HTTPSEnable", 0)
1462         kSCPropNetProxiesHTTPSProxy = sc.CFStringCreateWithCString(0, "HTTPSProxy", 0)
1463         kSCPropNetProxiesHTTPSPort = sc.CFStringCreateWithCString(0, "HTTPSPort", 0)
1464
1465         kSCPropNetProxiesFTPEnable = sc.CFStringCreateWithCString(0, "FTPEnable", 0)
1466         kSCPropNetProxiesFTPPassive = sc.CFStringCreateWithCString(0, "FTPPassive", 0)
1467         kSCPropNetProxiesFTPPort = sc.CFStringCreateWithCString(0, "FTPPort", 0)
1468         kSCPropNetProxiesFTPProxy = sc.CFStringCreateWithCString(0, "FTPProxy", 0)
1469
1470         kSCPropNetProxiesGopherEnable = sc.CFStringCreateWithCString(0, "GopherEnable", 0)
1471         kSCPropNetProxiesGopherPort = sc.CFStringCreateWithCString(0, "GopherPort", 0)
1472         kSCPropNetProxiesGopherProxy = sc.CFStringCreateWithCString(0, "GopherProxy", 0)
1473
1474         proxies = {}
1475         proxyDict = sc.SCDynamicStoreCopyProxies(None)
1476
1477         try:
1478             # HTTP:
1479             enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPEnable)
1480             if enabled and _CFNumberToInt32(sc, enabled):
1481                 proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPProxy)
1482                 port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPPort)
1483
1484                 if proxy:
1485                     proxy = _CStringFromCFString(sc, proxy)
1486                     if port:
1487                         port = _CFNumberToInt32(sc, port)
1488                         proxies["http"] = "http://%s:%i" % (proxy, port)
1489                     else:
1490                         proxies["http"] = "http://%s" % (proxy, )
1491
1492             # HTTPS:
1493             enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSEnable)
1494             if enabled and _CFNumberToInt32(sc, enabled):
1495                 proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSProxy)
1496                 port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSPort)
1497
1498                 if proxy:
1499                     proxy = _CStringFromCFString(sc, proxy)
1500                     if port:
1501                         port = _CFNumberToInt32(sc, port)
1502                         proxies["https"] = "http://%s:%i" % (proxy, port)
1503                     else:
1504                         proxies["https"] = "http://%s" % (proxy, )
1505
1506             # FTP:
1507             enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPEnable)
1508             if enabled and _CFNumberToInt32(sc, enabled):
1509                 proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPProxy)
1510                 port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPPort)
1511
1512                 if proxy:
1513                     proxy = _CStringFromCFString(sc, proxy)
1514                     if port:
1515                         port = _CFNumberToInt32(sc, port)
1516                         proxies["ftp"] = "http://%s:%i" % (proxy, port)
1517                     else:
1518                         proxies["ftp"] = "http://%s" % (proxy, )
1519
1520             # Gopher:
1521             enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherEnable)
1522             if enabled and _CFNumberToInt32(sc, enabled):
1523                 proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherProxy)
1524                 port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherPort)
1525
1526                 if proxy:
1527                     proxy = _CStringFromCFString(sc, proxy)
1528                     if port:
1529                         port = _CFNumberToInt32(sc, port)
1530                         proxies["gopher"] = "http://%s:%i" % (proxy, port)
1531                     else:
1532                         proxies["gopher"] = "http://%s" % (proxy, )
1533         finally:
1534             sc.CFRelease(proxyDict)
1535
1536         sc.CFRelease(kSCPropNetProxiesHTTPEnable)
1537         sc.CFRelease(kSCPropNetProxiesHTTPProxy)
1538         sc.CFRelease(kSCPropNetProxiesHTTPPort)
1539         sc.CFRelease(kSCPropNetProxiesFTPEnable)
1540         sc.CFRelease(kSCPropNetProxiesFTPPassive)
1541         sc.CFRelease(kSCPropNetProxiesFTPPort)
1542         sc.CFRelease(kSCPropNetProxiesFTPProxy)
1543         sc.CFRelease(kSCPropNetProxiesGopherEnable)
1544         sc.CFRelease(kSCPropNetProxiesGopherPort)
1545         sc.CFRelease(kSCPropNetProxiesGopherProxy)
1546
1547         return proxies
1548
1549
1550
1551     def proxy_bypass(host):
1552         if getproxies_environment():
1553             return proxy_bypass_environment(host)
1554         else:
1555             return proxy_bypass_macosx_sysconf(host)
1556
1557     def getproxies():
1558         return getproxies_environment() or getproxies_macosx_sysconf()
1559
1560 elif os.name == 'nt':
1561     def getproxies_registry():
1562         """Return a dictionary of scheme -> proxy server URL mappings.
1563
1564         Win32 uses the registry to store proxies.
1565
1566         """
1567         proxies = {}
1568         try:
1569             import _winreg
1570         except ImportError:
1571             # Std module, so should be around - but you never know!
1572             return proxies
1573         try:
1574             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1575                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1576             proxyEnable = _winreg.QueryValueEx(internetSettings,
1577                                                'ProxyEnable')[0]
1578             if proxyEnable:
1579                 # Returned as Unicode but problems if not converted to ASCII
1580                 proxyServer = str(_winreg.QueryValueEx(internetSettings,
1581                                                        'ProxyServer')[0])
1582                 if '=' in proxyServer:
1583                     # Per-protocol settings
1584                     for p in proxyServer.split(';'):
1585                         protocol, address = p.split('=', 1)
1586                         # See if address has a type:// prefix
1587                         import re
1588                         if not re.match('^([^/:]+)://', address):
1589                             address = '%s://%s' % (protocol, address)
1590                         proxies[protocol] = address
1591                 else:
1592                     # Use one setting for all protocols
1593                     if proxyServer[:5] == 'http:':
1594                         proxies['http'] = proxyServer
1595                     else:
1596                         proxies['http'] = 'http://%s' % proxyServer
1597                         proxies['ftp'] = 'ftp://%s' % proxyServer
1598             internetSettings.Close()
1599         except (WindowsError, ValueError, TypeError):
1600             # Either registry key not found etc, or the value in an
1601             # unexpected format.
1602             # proxies already set up to be empty so nothing to do
1603             pass
1604         return proxies
1605
1606     def getproxies():
1607         """Return a dictionary of scheme -> proxy server URL mappings.
1608
1609         Returns settings gathered from the environment, if specified,
1610         or the registry.
1611
1612         """
1613         return getproxies_environment() or getproxies_registry()
1614
1615     def proxy_bypass_registry(host):
1616         try:
1617             import _winreg
1618             import re
1619         except ImportError:
1620             # Std modules, so should be around - but you never know!
1621             return 0
1622         try:
1623             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1624                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1625             proxyEnable = _winreg.QueryValueEx(internetSettings,
1626                                                'ProxyEnable')[0]
1627             proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1628                                                      'ProxyOverride')[0])
1629             # ^^^^ Returned as Unicode but problems if not converted to ASCII
1630         except WindowsError:
1631             return 0
1632         if not proxyEnable or not proxyOverride:
1633             return 0
1634         # try to make a host list from name and IP address.
1635         rawHost, port = splitport(host)
1636         host = [rawHost]
1637         try:
1638             addr = socket.gethostbyname(rawHost)
1639             if addr != rawHost:
1640                 host.append(addr)
1641         except socket.error:
1642             pass
1643         try:
1644             fqdn = socket.getfqdn(rawHost)
1645             if fqdn != rawHost:
1646                 host.append(fqdn)
1647         except socket.error:
1648             pass
1649         # make a check value list from the registry entry: replace the
1650         # '<local>' string by the localhost entry and the corresponding
1651         # canonical entry.
1652         proxyOverride = proxyOverride.split(';')
1653         i = 0
1654         while i < len(proxyOverride):
1655             if proxyOverride[i] == '<local>':
1656                 proxyOverride[i:i+1] = ['localhost',
1657                                         '127.0.0.1',
1658                                         socket.gethostname(),
1659                                         socket.gethostbyname(
1660                                             socket.gethostname())]
1661             i += 1
1662         # print proxyOverride
1663         # now check if we match one of the registry values.
1664         for test in proxyOverride:
1665             test = test.replace(".", r"\.")     # mask dots
1666             test = test.replace("*", r".*")     # change glob sequence
1667             test = test.replace("?", r".")      # change glob char
1668             for val in host:
1669                 # print "%s <--> %s" %( test, val )
1670                 if re.match(test, val, re.I):
1671                     return 1
1672         return 0
1673
1674     def proxy_bypass(host):
1675         """Return a dictionary of scheme -> proxy server URL mappings.
1676
1677         Returns settings gathered from the environment, if specified,
1678         or the registry.
1679
1680         """
1681         if getproxies_environment():
1682             return proxy_bypass_environment(host)
1683         else:
1684             return proxy_bypass_registry(host)
1685
1686 else:
1687     # By default use environment variables
1688     getproxies = getproxies_environment
1689     proxy_bypass = proxy_bypass_environment
1690
1691 # Test and time quote() and unquote()
1692 def test1():
1693     s = ''
1694     for i in range(256): s = s + chr(i)
1695     s = s*4
1696     t0 = time.time()
1697     qs = quote(s)
1698     uqs = unquote(qs)
1699     t1 = time.time()
1700     if uqs != s:
1701         print 'Wrong!'
1702     print repr(s)
1703     print repr(qs)
1704     print repr(uqs)
1705     print round(t1 - t0, 3), 'sec'
1706
1707
1708 def reporthook(blocknum, blocksize, totalsize):
1709     # Report during remote transfers
1710     print "Block number: %d, Block size: %d, Total size: %d" % (
1711         blocknum, blocksize, totalsize)
1712
1713 # Test program
1714 def test(args=[]):
1715     if not args:
1716         args = [
1717             '/etc/passwd',
1718             'file:/etc/passwd',
1719             'file://localhost/etc/passwd',
1720             'ftp://ftp.gnu.org/pub/README',
1721             'http://www.python.org/index.html',
1722             ]
1723         if hasattr(URLopener, "open_https"):
1724             args.append('https://synergy.as.cmu.edu/~geek/')
1725     try:
1726         for url in args:
1727             print '-'*10, url, '-'*10
1728             fn, h = urlretrieve(url, None, reporthook)
1729             print fn
1730             if h:
1731                 print '======'
1732                 for k in h.keys(): print k + ':', h[k]
1733                 print '======'
1734             fp = open(fn, 'rb')
1735             data = fp.read()
1736             del fp
1737             if '\r' in data:
1738                 table = string.maketrans("", "")
1739                 data = data.translate(table, "\r")
1740             print data
1741             fn, h = None, None
1742         print '-'*40
1743     finally:
1744         urlcleanup()
1745
1746 def main():
1747     import getopt, sys
1748     try:
1749         opts, args = getopt.getopt(sys.argv[1:], "th")
1750     except getopt.error, msg:
1751         print msg
1752         print "Use -h for help"
1753         return
1754     t = 0
1755     for o, a in opts:
1756         if o == '-t':
1757             t = t + 1
1758         if o == '-h':
1759             print "Usage: python urllib.py [-t] [url ...]"
1760             print "-t runs self-test;",
1761             print "otherwise, contents of urls are printed"
1762             return
1763     if t:
1764         if t > 1:
1765             test1()
1766         test(args)
1767     else:
1768         if not args:
1769             print "Use -h for help"
1770         for url in args:
1771             print urlopen(url).read(),
1772
1773 # Run test program when run as a script
1774 if __name__ == '__main__':
1775     main()