1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
30 from urlparse
import urljoin
as basejoin
32 __all__
= ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
34 "urlencode", "url2pathname", "pathname2url", "splittag",
35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37 "splitnport", "splitquery", "splitattr", "splitvalue",
40 __version__
= '1.17' # XXX This version is not always updated :-(
42 MAXFTPCACHE
= 10 # Trim the ftp cache beyond this size
44 # Helper for non-unix systems
46 from macurl2path
import url2pathname
, pathname2url
48 from nturl2path
import url2pathname
, pathname2url
49 elif os
.name
== 'riscos':
50 from rourl2path
import url2pathname
, pathname2url
52 def url2pathname(pathname
):
53 """OS-specific conversion from a relative URL of the 'file' scheme
54 to a file system path; not recommended for general use."""
55 return unquote(pathname
)
57 def pathname2url(pathname
):
58 """OS-specific conversion from a file system path to a relative URL
59 of the 'file' scheme; not recommended for general use."""
60 return quote(pathname
)
62 # This really consists of two pieces:
63 # (1) a class which handles opening of all sorts of URLs
64 # (plus assorted utilities etc.)
65 # (2) a set of functions for parsing URLs
66 # XXX Should these be separated out into different modules?
69 # Shortcut for basic usage
71 def urlopen(url
, data
=None, proxies
=None):
72 """urlopen(url [, data]) -> open file-like object"""
74 if proxies
is not None:
75 opener
= FancyURLopener(proxies
=proxies
)
77 opener
= FancyURLopener()
82 return opener
.open(url
)
84 return opener
.open(url
, data
)
85 def urlretrieve(url
, filename
=None, reporthook
=None, data
=None):
88 _urlopener
= FancyURLopener()
89 return _urlopener
.retrieve(url
, filename
, reporthook
, data
)
94 # exception raised when downloaded size does not match content-length
95 class ContentTooShortError(IOError):
96 def __init__(self
, message
, content
):
97 IOError.__init
__(self
, message
)
98 self
.content
= content
102 """Class to open URLs.
103 This is a class rather than just a subroutine because we may need
104 more than one set of global protocol-specific options.
105 Note -- this is a base class for those who don't want the
106 automatic handling of errors type 302 (relocated) and 401
107 (authorization needed)."""
111 version
= "Python-urllib/%s" % __version__
114 def __init__(self
, proxies
=None, **x509
):
116 proxies
= getproxies()
117 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
118 self
.proxies
= proxies
119 self
.key_file
= x509
.get('key_file')
120 self
.cert_file
= x509
.get('cert_file')
121 self
.addheaders
= [('User-Agent', self
.version
)]
122 self
.__tempfiles
= []
123 self
.__unlink
= os
.unlink
# See cleanup()
124 self
.tempcache
= None
125 # Undocumented feature: if you assign {} to tempcache,
126 # it is used to cache files retrieved with
127 # self.retrieve(). This is not enabled by default
128 # since it does not work for changing documents (and I
129 # haven't got the logic to check expiration headers
131 self
.ftpcache
= ftpcache
132 # Undocumented feature: you can use a different
133 # ftp cache by assigning to the .ftpcache member;
134 # in case you want logically independent URL openers
135 # XXX This is not threadsafe. Bah.
144 # This code sometimes runs when the rest of this module
145 # has already been deleted, so it can't use any globals
146 # or import anything.
148 for file in self
.__tempfiles
:
153 del self
.__tempfiles
[:]
155 self
.tempcache
.clear()
157 def addheader(self
, *args
):
158 """Add a header to be used by the HTTP interface only
159 e.g. u.addheader('Accept', 'sound/basic')"""
160 self
.addheaders
.append(args
)
163 def open(self
, fullurl
, data
=None):
164 """Use URLopener().open(file) instead of open(file, 'r')."""
165 fullurl
= unwrap(toBytes(fullurl
))
166 if self
.tempcache
and fullurl
in self
.tempcache
:
167 filename
, headers
= self
.tempcache
[fullurl
]
168 fp
= open(filename
, 'rb')
169 return addinfourl(fp
, headers
, fullurl
)
170 urltype
, url
= splittype(fullurl
)
173 if urltype
in self
.proxies
:
174 proxy
= self
.proxies
[urltype
]
175 urltype
, proxyhost
= splittype(proxy
)
176 host
, selector
= splithost(proxyhost
)
177 url
= (host
, fullurl
) # Signal special case to open_*()
180 name
= 'open_' + urltype
182 name
= name
.replace('-', '_')
183 if not hasattr(self
, name
):
185 return self
.open_unknown_proxy(proxy
, fullurl
, data
)
187 return self
.open_unknown(fullurl
, data
)
190 return getattr(self
, name
)(url
)
192 return getattr(self
, name
)(url
, data
)
193 except socket
.error
, msg
:
194 raise IOError, ('socket error', msg
), sys
.exc_info()[2]
196 def open_unknown(self
, fullurl
, data
=None):
197 """Overridable interface to open unknown URL type."""
198 type, url
= splittype(fullurl
)
199 raise IOError, ('url error', 'unknown url type', type)
201 def open_unknown_proxy(self
, proxy
, fullurl
, data
=None):
202 """Overridable interface to open unknown URL type."""
203 type, url
= splittype(fullurl
)
204 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy
)
207 def retrieve(self
, url
, filename
=None, reporthook
=None, data
=None):
208 """retrieve(url) returns (filename, headers) for a local object
209 or (tempfilename, headers) for a remote object."""
210 url
= unwrap(toBytes(url
))
211 if self
.tempcache
and url
in self
.tempcache
:
212 return self
.tempcache
[url
]
213 type, url1
= splittype(url
)
214 if filename
is None and (not type or type == 'file'):
216 fp
= self
.open_local_file(url1
)
219 return url2pathname(splithost(url1
)[1]), hdrs
222 fp
= self
.open(url
, data
)
225 tfp
= open(filename
, 'wb')
228 garbage
, path
= splittype(url
)
229 garbage
, path
= splithost(path
or "")
230 path
, garbage
= splitquery(path
or "")
231 path
, garbage
= splitattr(path
or "")
232 suffix
= os
.path
.splitext(path
)[1]
233 (fd
, filename
) = tempfile
.mkstemp(suffix
)
234 self
.__tempfiles
.append(filename
)
235 tfp
= os
.fdopen(fd
, 'wb')
236 result
= filename
, headers
237 if self
.tempcache
is not None:
238 self
.tempcache
[url
] = result
244 if "content-length" in headers
:
245 size
= int(headers
["Content-Length"])
246 reporthook(blocknum
, bs
, size
)
255 reporthook(blocknum
, bs
, size
)
261 # raise exception if actual size does not match content-length header
262 if size
>= 0 and read
< size
:
263 raise ContentTooShortError("retrieval incomplete: got only %i out "
264 "of %i bytes" % (read
, size
), result
)
268 # Each method named open_<type> knows how to open that type of URL
270 def open_http(self
, url
, data
=None):
271 """Use HTTP protocol."""
275 if isinstance(url
, str):
276 host
, selector
= splithost(url
)
278 user_passwd
, host
= splituser(host
)
283 # check whether the proxy contains authorization information
284 proxy_passwd
, host
= splituser(host
)
285 # now we proceed with the url we want to obtain
286 urltype
, rest
= splittype(selector
)
289 if urltype
.lower() != 'http':
292 realhost
, rest
= splithost(rest
)
294 user_passwd
, realhost
= splituser(realhost
)
296 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
297 if proxy_bypass(realhost
):
300 #print "proxy via http:", host, selector
301 if not host
: raise IOError, ('http error', 'no host given')
305 proxy_auth
= base64
.b64encode(proxy_passwd
).strip()
311 auth
= base64
.b64encode(user_passwd
).strip()
314 h
= httplib
.HTTP(host
)
316 h
.putrequest('POST', selector
)
317 h
.putheader('Content-Type', 'application/x-www-form-urlencoded')
318 h
.putheader('Content-Length', '%d' % len(data
))
320 h
.putrequest('GET', selector
)
321 if proxy_auth
: h
.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth
)
322 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
323 if realhost
: h
.putheader('Host', realhost
)
324 for args
in self
.addheaders
: h
.putheader(*args
)
328 errcode
, errmsg
, headers
= h
.getreply()
332 # something went wrong with the HTTP status line
333 raise IOError, ('http protocol error', 0,
334 'got a bad status line', None)
336 return addinfourl(fp
, headers
, "http:" + url
)
339 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
341 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
, data
)
343 def http_error(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
344 """Handle http errors.
345 Derived class can override this, or provide specific handlers
346 named http_error_DDD where DDD is the 3-digit error code."""
347 # First check if there's a specific handler for this error
348 name
= 'http_error_%d' % errcode
349 if hasattr(self
, name
):
350 method
= getattr(self
, name
)
352 result
= method(url
, fp
, errcode
, errmsg
, headers
)
354 result
= method(url
, fp
, errcode
, errmsg
, headers
, data
)
355 if result
: return result
356 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
358 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
359 """Default error handler: close the connection and raise IOError."""
362 raise IOError, ('http error', errcode
, errmsg
, headers
)
364 if hasattr(socket
, "ssl"):
365 def open_https(self
, url
, data
=None):
366 """Use HTTPS protocol."""
370 if isinstance(url
, str):
371 host
, selector
= splithost(url
)
373 user_passwd
, host
= splituser(host
)
378 # here, we determine, whether the proxy contains authorization information
379 proxy_passwd
, host
= splituser(host
)
380 urltype
, rest
= splittype(selector
)
383 if urltype
.lower() != 'https':
386 realhost
, rest
= splithost(rest
)
388 user_passwd
, realhost
= splituser(realhost
)
390 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
391 #print "proxy via https:", host, selector
392 if not host
: raise IOError, ('https error', 'no host given')
395 proxy_auth
= base64
.b64encode(proxy_passwd
).strip()
400 auth
= base64
.b64encode(user_passwd
).strip()
403 h
= httplib
.HTTPS(host
, 0,
404 key_file
=self
.key_file
,
405 cert_file
=self
.cert_file
)
407 h
.putrequest('POST', selector
)
408 h
.putheader('Content-Type',
409 'application/x-www-form-urlencoded')
410 h
.putheader('Content-Length', '%d' % len(data
))
412 h
.putrequest('GET', selector
)
413 if proxy_auth
: h
.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth
)
414 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
415 if realhost
: h
.putheader('Host', realhost
)
416 for args
in self
.addheaders
: h
.putheader(*args
)
420 errcode
, errmsg
, headers
= h
.getreply()
424 # something went wrong with the HTTP status line
425 raise IOError, ('http protocol error', 0,
426 'got a bad status line', None)
428 return addinfourl(fp
, headers
, "https:" + url
)
431 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
433 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
,
436 def open_file(self
, url
):
437 """Use local file or FTP depending on form of URL."""
438 if not isinstance(url
, str):
439 raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
440 if url
[:2] == '//' and url
[2:3] != '/' and url
[2:12].lower() != 'localhost/':
441 return self
.open_ftp(url
)
443 return self
.open_local_file(url
)
445 def open_local_file(self
, url
):
446 """Use local file."""
447 import mimetypes
, mimetools
, email
.utils
449 from cStringIO
import StringIO
451 from StringIO
import StringIO
452 host
, file = splithost(url
)
453 localname
= url2pathname(file)
455 stats
= os
.stat(localname
)
457 raise IOError(e
.errno
, e
.strerror
, e
.filename
)
459 modified
= email
.utils
.formatdate(stats
.st_mtime
, usegmt
=True)
460 mtype
= mimetypes
.guess_type(url
)[0]
461 headers
= mimetools
.Message(StringIO(
462 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
463 (mtype
or 'text/plain', size
, modified
)))
467 urlfile
= 'file://' + file
468 return addinfourl(open(localname
, 'rb'),
470 host
, port
= splitport(host
)
472 and socket
.gethostbyname(host
) in (localhost(), thishost()):
475 urlfile
= 'file://' + file
476 return addinfourl(open(localname
, 'rb'),
478 raise IOError, ('local file error', 'not on local host')
480 def open_ftp(self
, url
):
481 """Use FTP protocol."""
482 if not isinstance(url
, str):
483 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
484 import mimetypes
, mimetools
486 from cStringIO
import StringIO
488 from StringIO
import StringIO
489 host
, path
= splithost(url
)
490 if not host
: raise IOError, ('ftp error', 'no host given')
491 host
, port
= splitport(host
)
492 user
, host
= splituser(host
)
493 if user
: user
, passwd
= splitpasswd(user
)
496 user
= unquote(user
or '')
497 passwd
= unquote(passwd
or '')
498 host
= socket
.gethostbyname(host
)
501 port
= ftplib
.FTP_PORT
504 path
, attrs
= splitattr(path
)
506 dirs
= path
.split('/')
507 dirs
, file = dirs
[:-1], dirs
[-1]
508 if dirs
and not dirs
[0]: dirs
= dirs
[1:]
509 if dirs
and not dirs
[0]: dirs
[0] = '/'
510 key
= user
, host
, port
, '/'.join(dirs
)
512 if len(self
.ftpcache
) > MAXFTPCACHE
:
513 # Prune the cache, rather arbitrarily
514 for k
in self
.ftpcache
.keys():
520 if not key
in self
.ftpcache
:
521 self
.ftpcache
[key
] = \
522 ftpwrapper(user
, passwd
, host
, port
, dirs
)
523 if not file: type = 'D'
526 attr
, value
= splitvalue(attr
)
527 if attr
.lower() == 'type' and \
528 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
530 (fp
, retrlen
) = self
.ftpcache
[key
].retrfile(file, type)
531 mtype
= mimetypes
.guess_type("ftp:" + url
)[0]
534 headers
+= "Content-Type: %s\n" % mtype
535 if retrlen
is not None and retrlen
>= 0:
536 headers
+= "Content-Length: %d\n" % retrlen
537 headers
= mimetools
.Message(StringIO(headers
))
538 return addinfourl(fp
, headers
, "ftp:" + url
)
539 except ftperrors(), msg
:
540 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
542 def open_data(self
, url
, data
=None):
543 """Use "data" URL."""
544 if not isinstance(url
, str):
545 raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
548 # syntax of data URLs:
549 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
550 # mediatype := [ type "/" subtype ] *( ";" parameter )
552 # parameter := attribute "=" value
555 from cStringIO
import StringIO
557 from StringIO
import StringIO
559 [type, data
] = url
.split(',', 1)
561 raise IOError, ('data error', 'bad data URL')
563 type = 'text/plain;charset=US-ASCII'
564 semi
= type.rfind(';')
565 if semi
>= 0 and '=' not in type[semi
:]:
566 encoding
= type[semi
+1:]
571 msg
.append('Date: %s'%time
.strftime('%a, %d %b %Y %T GMT',
572 time
.gmtime(time
.time())))
573 msg
.append('Content-type: %s' % type)
574 if encoding
== 'base64':
576 data
= base64
.decodestring(data
)
579 msg
.append('Content-Length: %d' % len(data
))
584 headers
= mimetools
.Message(f
, 0)
585 #f.fileno = None # needed for addinfourl
586 return addinfourl(f
, headers
, url
)
589 class FancyURLopener(URLopener
):
590 """Derived class with handlers for errors we can handle (perhaps)."""
592 def __init__(self
, *args
, **kwargs
):
593 URLopener
.__init
__(self
, *args
, **kwargs
)
598 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
599 """Default error handling -- don't raise an exception."""
600 return addinfourl(fp
, headers
, "http:" + url
)
602 def http_error_302(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
603 """Error 302 -- relocated (temporarily)."""
605 if self
.maxtries
and self
.tries
>= self
.maxtries
:
606 if hasattr(self
, "http_error_500"):
607 meth
= self
.http_error_500
609 meth
= self
.http_error_default
611 return meth(url
, fp
, 500,
612 "Internal Server Error: Redirect Recursion", headers
)
613 result
= self
.redirect_internal(url
, fp
, errcode
, errmsg
, headers
,
618 def redirect_internal(self
, url
, fp
, errcode
, errmsg
, headers
, data
):
619 if 'location' in headers
:
620 newurl
= headers
['location']
621 elif 'uri' in headers
:
622 newurl
= headers
['uri']
627 # In case the server sent a relative URL, join with original:
628 newurl
= basejoin(self
.type + ":" + url
, newurl
)
629 return self
.open(newurl
)
631 def http_error_301(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
632 """Error 301 -- also relocated (permanently)."""
633 return self
.http_error_302(url
, fp
, errcode
, errmsg
, headers
, data
)
635 def http_error_303(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
636 """Error 303 -- also relocated (essentially identical to 302)."""
637 return self
.http_error_302(url
, fp
, errcode
, errmsg
, headers
, data
)
639 def http_error_307(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
640 """Error 307 -- relocated, but turn POST into error."""
642 return self
.http_error_302(url
, fp
, errcode
, errmsg
, headers
, data
)
644 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
646 def http_error_401(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
647 """Error 401 -- authentication required.
648 This function supports Basic authentication only."""
649 if not 'www-authenticate' in headers
:
650 URLopener
.http_error_default(self
, url
, fp
,
651 errcode
, errmsg
, headers
)
652 stuff
= headers
['www-authenticate']
654 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
656 URLopener
.http_error_default(self
, url
, fp
,
657 errcode
, errmsg
, headers
)
658 scheme
, realm
= match
.groups()
659 if scheme
.lower() != 'basic':
660 URLopener
.http_error_default(self
, url
, fp
,
661 errcode
, errmsg
, headers
)
662 name
= 'retry_' + self
.type + '_basic_auth'
664 return getattr(self
,name
)(url
, realm
)
666 return getattr(self
,name
)(url
, realm
, data
)
668 def http_error_407(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
669 """Error 407 -- proxy authentication required.
670 This function supports Basic authentication only."""
671 if not 'proxy-authenticate' in headers
:
672 URLopener
.http_error_default(self
, url
, fp
,
673 errcode
, errmsg
, headers
)
674 stuff
= headers
['proxy-authenticate']
676 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
678 URLopener
.http_error_default(self
, url
, fp
,
679 errcode
, errmsg
, headers
)
680 scheme
, realm
= match
.groups()
681 if scheme
.lower() != 'basic':
682 URLopener
.http_error_default(self
, url
, fp
,
683 errcode
, errmsg
, headers
)
684 name
= 'retry_proxy_' + self
.type + '_basic_auth'
686 return getattr(self
,name
)(url
, realm
)
688 return getattr(self
,name
)(url
, realm
, data
)
690 def retry_proxy_http_basic_auth(self
, url
, realm
, data
=None):
691 host
, selector
= splithost(url
)
692 newurl
= 'http://' + host
+ selector
693 proxy
= self
.proxies
['http']
694 urltype
, proxyhost
= splittype(proxy
)
695 proxyhost
, proxyselector
= splithost(proxyhost
)
696 i
= proxyhost
.find('@') + 1
697 proxyhost
= proxyhost
[i
:]
698 user
, passwd
= self
.get_user_passwd(proxyhost
, realm
, i
)
699 if not (user
or passwd
): return None
700 proxyhost
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + proxyhost
701 self
.proxies
['http'] = 'http://' + proxyhost
+ proxyselector
703 return self
.open(newurl
)
705 return self
.open(newurl
, data
)
707 def retry_proxy_https_basic_auth(self
, url
, realm
, data
=None):
708 host
, selector
= splithost(url
)
709 newurl
= 'https://' + host
+ selector
710 proxy
= self
.proxies
['https']
711 urltype
, proxyhost
= splittype(proxy
)
712 proxyhost
, proxyselector
= splithost(proxyhost
)
713 i
= proxyhost
.find('@') + 1
714 proxyhost
= proxyhost
[i
:]
715 user
, passwd
= self
.get_user_passwd(proxyhost
, realm
, i
)
716 if not (user
or passwd
): return None
717 proxyhost
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + proxyhost
718 self
.proxies
['https'] = 'https://' + proxyhost
+ proxyselector
720 return self
.open(newurl
)
722 return self
.open(newurl
, data
)
724 def retry_http_basic_auth(self
, url
, realm
, data
=None):
725 host
, selector
= splithost(url
)
726 i
= host
.find('@') + 1
728 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
729 if not (user
or passwd
): return None
730 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
731 newurl
= 'http://' + host
+ selector
733 return self
.open(newurl
)
735 return self
.open(newurl
, data
)
737 def retry_https_basic_auth(self
, url
, realm
, data
=None):
738 host
, selector
= splithost(url
)
739 i
= host
.find('@') + 1
741 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
742 if not (user
or passwd
): return None
743 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
744 newurl
= 'https://' + host
+ selector
746 return self
.open(newurl
)
748 return self
.open(newurl
, data
)
750 def get_user_passwd(self
, host
, realm
, clear_cache
= 0):
751 key
= realm
+ '@' + host
.lower()
752 if key
in self
.auth_cache
:
754 del self
.auth_cache
[key
]
756 return self
.auth_cache
[key
]
757 user
, passwd
= self
.prompt_user_passwd(host
, realm
)
758 if user
or passwd
: self
.auth_cache
[key
] = (user
, passwd
)
761 def prompt_user_passwd(self
, host
, realm
):
762 """Override this in a GUI environment!"""
765 user
= raw_input("Enter username for %s at %s: " % (realm
,
767 passwd
= getpass
.getpass("Enter password for %s in %s at %s: " %
770 except KeyboardInterrupt:
779 """Return the IP address of the magic hostname 'localhost'."""
781 if _localhost
is None:
782 _localhost
= socket
.gethostbyname('localhost')
787 """Return the IP address of the current host."""
789 if _thishost
is None:
790 _thishost
= socket
.gethostbyname(socket
.gethostname())
795 """Return the set of errors raised by the FTP class."""
797 if _ftperrors
is None:
799 _ftperrors
= ftplib
.all_errors
804 """Return an empty mimetools.Message object."""
806 if _noheaders
is None:
809 from cStringIO
import StringIO
811 from StringIO
import StringIO
812 _noheaders
= mimetools
.Message(StringIO(), 0)
813 _noheaders
.fp
.close() # Recycle file descriptor
820 """Class used by open_ftp() for cache of open FTP connections."""
822 def __init__(self
, user
, passwd
, host
, port
, dirs
):
833 self
.ftp
= ftplib
.FTP()
834 self
.ftp
.connect(self
.host
, self
.port
)
835 self
.ftp
.login(self
.user
, self
.passwd
)
836 for dir in self
.dirs
:
839 def retrfile(self
, file, type):
842 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
843 else: cmd
= 'TYPE ' + type; isdir
= 0
845 self
.ftp
.voidcmd(cmd
)
846 except ftplib
.all_errors
:
848 self
.ftp
.voidcmd(cmd
)
850 if file and not isdir
:
851 # Try to retrieve as a file
854 conn
= self
.ftp
.ntransfercmd(cmd
)
855 except ftplib
.error_perm
, reason
:
856 if str(reason
)[:3] != '550':
857 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
859 # Set transfer mode to ASCII!
860 self
.ftp
.voidcmd('TYPE A')
861 # Try a directory listing
862 if file: cmd
= 'LIST ' + file
864 conn
= self
.ftp
.ntransfercmd(cmd
)
866 # Pass back both a suitably decorated object and a retrieval length
867 return (addclosehook(conn
[0].makefile('rb'),
868 self
.endtransfer
), conn
[1])
869 def endtransfer(self
):
886 """Base class for addinfo and addclosehook."""
888 def __init__(self
, fp
):
890 self
.read
= self
.fp
.read
891 self
.readline
= self
.fp
.readline
892 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
893 if hasattr(self
.fp
, "fileno"):
894 self
.fileno
= self
.fp
.fileno
896 self
.fileno
= lambda: None
897 if hasattr(self
.fp
, "__iter__"):
898 self
.__iter
__ = self
.fp
.__iter
__
899 if hasattr(self
.fp
, "next"):
900 self
.next
= self
.fp
.next
903 return '<%s at %r whose fp = %r>' % (self
.__class
__.__name
__,
909 self
.readlines
= None
911 if self
.fp
: self
.fp
.close()
914 class addclosehook(addbase
):
915 """Class to add a close hook to an open file."""
917 def __init__(self
, fp
, closehook
, *hookargs
):
918 addbase
.__init
__(self
, fp
)
919 self
.closehook
= closehook
920 self
.hookargs
= hookargs
925 self
.closehook(*self
.hookargs
)
926 self
.closehook
= None
929 class addinfo(addbase
):
930 """class to add an info() method to an open file."""
932 def __init__(self
, fp
, headers
):
933 addbase
.__init
__(self
, fp
)
934 self
.headers
= headers
939 class addinfourl(addbase
):
940 """class to add info() and geturl() methods to an open file."""
942 def __init__(self
, fp
, headers
, url
):
943 addbase
.__init
__(self
, fp
)
944 self
.headers
= headers
954 # Utilities to parse URLs (most of these return None for missing parts):
955 # unwrap('<URL:type://host/path>') --> 'type://host/path'
956 # splittype('type:opaquestring') --> 'type', 'opaquestring'
957 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
958 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
959 # splitpasswd('user:passwd') -> 'user', 'passwd'
960 # splitport('host:port') --> 'host', 'port'
961 # splitquery('/path?query') --> '/path', 'query'
962 # splittag('/path#tag') --> '/path', 'tag'
963 # splitattr('/path;attr1=value1;attr2=value2;...') ->
964 # '/path', ['attr1=value1', 'attr2=value2', ...]
965 # splitvalue('attr=value') --> 'attr', 'value'
966 # unquote('abc%20def') -> 'abc def'
967 # quote('abc def') -> 'abc%20def')
976 return isinstance(x
, unicode)
979 """toBytes(u"URL") --> 'URL'."""
980 # Most URL schemes require ASCII. If that changes, the conversion
984 url
= url
.encode("ASCII")
986 raise UnicodeError("URL " + repr(url
) +
987 " contains non-ASCII characters")
991 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
993 if url
[:1] == '<' and url
[-1:] == '>':
994 url
= url
[1:-1].strip()
995 if url
[:4] == 'URL:': url
= url
[4:].strip()
1000 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1002 if _typeprog
is None:
1004 _typeprog
= re
.compile('^([^/:]+):')
1006 match
= _typeprog
.match(url
)
1008 scheme
= match
.group(1)
1009 return scheme
.lower(), url
[len(scheme
) + 1:]
1014 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1016 if _hostprog
is None:
1018 _hostprog
= re
.compile('^//([^/?]*)(.*)$')
1020 match
= _hostprog
.match(url
)
1021 if match
: return match
.group(1, 2)
1025 def splituser(host
):
1026 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1028 if _userprog
is None:
1030 _userprog
= re
.compile('^(.*)@(.*)$')
1032 match
= _userprog
.match(host
)
1033 if match
: return map(unquote
, match
.group(1, 2))
1037 def splitpasswd(user
):
1038 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1040 if _passwdprog
is None:
1042 _passwdprog
= re
.compile('^([^:]*):(.*)$')
1044 match
= _passwdprog
.match(user
)
1045 if match
: return match
.group(1, 2)
1048 # splittag('/path#tag') --> '/path', 'tag'
1050 def splitport(host
):
1051 """splitport('host:port') --> 'host', 'port'."""
1053 if _portprog
is None:
1055 _portprog
= re
.compile('^(.*):([0-9]+)$')
1057 match
= _portprog
.match(host
)
1058 if match
: return match
.group(1, 2)
1062 def splitnport(host
, defport
=-1):
1063 """Split host and port, returning numeric port.
1064 Return given default port if no ':' found; defaults to -1.
1065 Return numerical port if a valid number are found after ':'.
1066 Return None if ':' but not a valid number."""
1068 if _nportprog
is None:
1070 _nportprog
= re
.compile('^(.*):(.*)$')
1072 match
= _nportprog
.match(host
)
1074 host
, port
= match
.group(1, 2)
1076 if not port
: raise ValueError, "no digits"
1081 return host
, defport
1084 def splitquery(url
):
1085 """splitquery('/path?query') --> '/path', 'query'."""
1087 if _queryprog
is None:
1089 _queryprog
= re
.compile('^(.*)\?([^?]*)$')
1091 match
= _queryprog
.match(url
)
1092 if match
: return match
.group(1, 2)
1097 """splittag('/path#tag') --> '/path', 'tag'."""
1099 if _tagprog
is None:
1101 _tagprog
= re
.compile('^(.*)#([^#]*)$')
1103 match
= _tagprog
.match(url
)
1104 if match
: return match
.group(1, 2)
1108 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1109 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1110 words
= url
.split(';')
1111 return words
[0], words
[1:]
1114 def splitvalue(attr
):
1115 """splitvalue('attr=value') --> 'attr', 'value'."""
1117 if _valueprog
is None:
1119 _valueprog
= re
.compile('^([^=]*)=(.*)$')
1121 match
= _valueprog
.match(attr
)
1122 if match
: return match
.group(1, 2)
1125 _hextochr
= dict(('%02x' % i
, chr(i
)) for i
in range(256))
1126 _hextochr
.update(('%02X' % i
, chr(i
)) for i
in range(256))
1129 """unquote('abc%20def') -> 'abc def'."""
1131 for i
in xrange(1, len(res
)):
1134 res
[i
] = _hextochr
[item
[:2]] + item
[2:]
1137 except UnicodeDecodeError:
1138 res
[i
] = unichr(int(item
[:2], 16)) + item
[2:]
1141 def unquote_plus(s
):
1142 """unquote('%7e/abc+def') -> '~/abc def'"""
1143 s
= s
.replace('+', ' ')
1146 always_safe
= ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1147 'abcdefghijklmnopqrstuvwxyz'
1151 def quote(s
, safe
= '/'):
1152 """quote('abc def') -> 'abc%20def'
1154 Each part of a URL, e.g. the path info, the query, etc., has a
1155 different set of reserved characters that must be quoted.
1157 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1158 the following reserved characters.
1160 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1163 Each of these characters is reserved in some component of a URL,
1164 but not necessarily in all of them.
1166 By default, the quote function is intended for quoting the path
1167 section of a URL. Thus, it will not encode '/'. This character
1168 is reserved, but in typical usage the quote function is being
1169 called on a path where the existing slash characters are used as
1170 reserved characters.
1172 cachekey
= (safe
, always_safe
)
1174 safe_map
= _safemaps
[cachekey
]
1178 for i
in range(256):
1180 safe_map
[c
] = (c
in safe
) and c
or ('%%%02X' % i
)
1181 _safemaps
[cachekey
] = safe_map
1182 res
= map(safe_map
.__getitem
__, s
)
1185 def quote_plus(s
, safe
= ''):
1186 """Quote the query fragment of a URL; replacing ' ' with '+'"""
1188 s
= quote(s
, safe
+ ' ')
1189 return s
.replace(' ', '+')
1190 return quote(s
, safe
)
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        # mapping objects: flatten to a sequence of (key, value) pairs
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # re-raise with the original traceback so the caller can see
            # where the bad argument actually failed
            ty, va, tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior: every value is stringified whole
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif _is_unicode(v):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence: fall back to stringifying it whole
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence, one parameter per element
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    found = {}
    for var, url in os.environ.items():
        var = var.lower()
        # pick up '<scheme>_proxy' entries, ignoring empty values
        if url and var[-6:] == '_proxy':
            found[var[:-6]] = url
    return found
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            # Internet Config support unavailable: report no proxies
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        # no bypass information on the mac: never bypass the proxy
        return 0

    def getproxies():
        # environment settings take precedence over Internet Config
        return getproxies_environment() or getproxies_internetconfig()
elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            # no scheme given: assume the protocol name
                            # doubles as the URL scheme
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
    def proxy_bypass(host):
        """Return 1 if 'host' matches the registry ProxyOverride list, else 0."""
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # translate the shell-style glob into a regular expression
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0
else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        # no platform-specific bypass information available
        return 0
# Test and time quote() and unquote()
def test1():
    # round-trip every byte value through quote()/unquote() and time it
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print repr(s)
    print repr(qs)
    print repr(uqs)
    print round(t1 - t0, 3), 'sec'
1447 def reporthook(blocknum
, blocksize
, totalsize
):
1448 # Report during remote transfers
1449 print "Block number: %d, Block size: %d, Total size: %d" % (
1450 blocknum
, blocksize
, totalsize
)
def test(args=[]):
    # Self-test: fetch each URL in args (or a default list spanning the
    # file, ftp and http protocols), printing headers and body.
    if not args:
        # note: args is rebound here, so the mutable default is never mutated
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.gnu.org/pub/README',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                # strip carriage returns for display
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
        print '-'*40
    finally:
        # always remove the temporary files created by urlretrieve
        urlcleanup()
def main():
    """Command-line driver: -t runs the self-test, otherwise print URL contents."""
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        # -t given more than once also runs the quote/unquote timing test
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),
# Run test program when run as a script
if __name__ == '__main__':
    main()