Lib/urllib.py

   1 """Open an arbitrary URL.
   2
   3 See the following document for more info on URLs:
   4 "Names and Addresses, URIs, URLs, URNs, URCs", at
   5 http://www.w3.org/pub/WWW/Addressing/Overview.html
   6
   7 See also the HTTP spec (from which the error codes are derived):
   8 "HTTP - Hypertext Transfer Protocol", at
   9 http://www.w3.org/pub/WWW/Protocols/
  10
  11 Related standards and specs:
  12 - RFC1808: the "relative URL" spec. (authoritative status)
  13 - RFC1738 - the "URL standard". (authoritative status)
  14 - RFC1630 - the "URI spec". (informational status)
  15
  16 The object returned by URLopener().open(file) will differ per
  17 protocol.  All you know is that is has methods read(), readline(),
  18 readlines(), fileno(), close() and info().  The read*(), fileno()
  19 and close() methods work like those of open files.
  20 The info() method returns a mimetools.Message object which can be
  21 used to query various info about the object, if available.
  22 (mimetools.Message objects are queried with the getheader() method.)
  23 """
  24
  25 import string
  26 import socket
  27 import os
  28 import time
  29 import sys
  30 from urlparse import urljoin as basejoin
  31
  32 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
  33            "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
  34            "urlencode", "url2pathname", "pathname2url", "splittag",
  35            "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
  36            "splittype", "splithost", "splituser", "splitpasswd", "splitport",
  37            "splitnport", "splitquery", "splitattr", "splitvalue",
  38            "getproxies"]
  39
  40 __version__ = '1.17'    # XXX This version is not always updated :-(
  41
  42 MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
  43
  44 # Helper for non-unix systems
  45 if os.name == 'mac':
  46     from macurl2path import url2pathname, pathname2url
  47 elif os.name == 'nt':
  48     from nturl2path import url2pathname, pathname2url
  49 elif os.name == 'riscos':
  50     from rourl2path import url2pathname, pathname2url
  51 else:
  52     def url2pathname(pathname):
  53         """OS-specific conversion from a relative URL of the 'file' scheme
  54         to a file system path; not recommended for general use."""
  55         return unquote(pathname)
  56
  57     def pathname2url(pathname):
  58         """OS-specific conversion from a file system path to a relative URL
  59         of the 'file' scheme; not recommended for general use."""
  60         return quote(pathname)
  61
  62 # This really consists of two pieces:
  63 # (1) a class which handles opening of all sorts of URLs
  64 #     (plus assorted utilities etc.)
  65 # (2) a set of functions for parsing URLs
  66 # XXX Should these be separated out into different modules?
  67
  68
  69 # Shortcut for basic usage
  70 _urlopener = None
  71 def urlopen(url, data=None, proxies=None):
  72     """urlopen(url [, data]) -> open file-like object"""
  73     global _urlopener
  74     if proxies is not None:
  75         opener = FancyURLopener(proxies=proxies)
  76     elif not _urlopener:
  77         opener = FancyURLopener()
  78         _urlopener = opener
  79     else:
  80         opener = _urlopener
  81     if data is None:
  82         return opener.open(url)
  83     else:
  84         return opener.open(url, data)
  85 def urlretrieve(url, filename=None, reporthook=None, data=None):
  86     global _urlopener
  87     if not _urlopener:
  88         _urlopener = FancyURLopener()
  89     return _urlopener.retrieve(url, filename, reporthook, data)
  90 def urlcleanup():
  91     if _urlopener:
  92         _urlopener.cleanup()
  93
  94 # check for SSL
  95 try:
  96     import ssl
  97 except:
  98     _have_ssl = False
  99 else:
 100     _have_ssl = True
 101
 102 # exception raised when downloaded size does not match content-length
 103 class ContentTooShortError(IOError):
 104     def __init__(self, message, content):
 105         IOError.__init__(self, message)
 106         self.content = content
 107
 108 ftpcache = {}
 109 class URLopener:
 110     """Class to open URLs.
 111     This is a class rather than just a subroutine because we may need
 112     more than one set of global protocol-specific options.
 113     Note -- this is a base class for those who don't want the
 114     automatic handling of errors type 302 (relocated) and 401
 115     (authorization needed)."""
 116
 117     __tempfiles = None
 118
 119     version = "Python-urllib/%s" % __version__
 120
 121     # Constructor
 122     def __init__(self, proxies=None, **x509):
 123         if proxies is None:
 124             proxies = getproxies()
 125         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 126         self.proxies = proxies
 127         self.key_file = x509.get('key_file')
 128         self.cert_file = x509.get('cert_file')
 129         self.addheaders = [('User-Agent', self.version)]
 130         self.__tempfiles = []
 131         self.__unlink = os.unlink # See cleanup()
 132         self.tempcache = None
 133         # Undocumented feature: if you assign {} to tempcache,
 134         # it is used to cache files retrieved with
 135         # self.retrieve().  This is not enabled by default
 136         # since it does not work for changing documents (and I
 137         # haven't got the logic to check expiration headers
 138         # yet).
 139         self.ftpcache = ftpcache
 140         # Undocumented feature: you can use a different
 141         # ftp cache by assigning to the .ftpcache member;
 142         # in case you want logically independent URL openers
 143         # XXX This is not threadsafe.  Bah.
 144
 145     def __del__(self):
 146         self.close()
 147
 148     def close(self):
 149         self.cleanup()
 150
 151     def cleanup(self):
 152         # This code sometimes runs when the rest of this module
 153         # has already been deleted, so it can't use any globals
 154         # or import anything.
 155         if self.__tempfiles:
 156             for file in self.__tempfiles:
 157                 try:
 158                     self.__unlink(file)
 159                 except OSError:
 160                     pass
 161             del self.__tempfiles[:]
 162         if self.tempcache:
 163             self.tempcache.clear()
 164
 165     def addheader(self, *args):
 166         """Add a header to be used by the HTTP interface only
 167         e.g. u.addheader('Accept', 'sound/basic')"""
 168         self.addheaders.append(args)
 169
 170     # External interface
 171     def open(self, fullurl, data=None):
 172         """Use URLopener().open(file) instead of open(file, 'r')."""
 173         fullurl = unwrap(toBytes(fullurl))
 174         if self.tempcache and fullurl in self.tempcache:
 175             filename, headers = self.tempcache[fullurl]
 176             fp = open(filename, 'rb')
 177             return addinfourl(fp, headers, fullurl)
 178         urltype, url = splittype(fullurl)
 179         if not urltype:
 180             urltype = 'file'
 181         if urltype in self.proxies:
 182             proxy = self.proxies[urltype]
 183             urltype, proxyhost = splittype(proxy)
 184             host, selector = splithost(proxyhost)
 185             url = (host, fullurl) # Signal special case to open_*()
 186         else:
 187             proxy = None
 188         name = 'open_' + urltype
 189         self.type = urltype
 190         name = name.replace('-', '_')
 191         if not hasattr(self, name):
 192             if proxy:
 193                 return self.open_unknown_proxy(proxy, fullurl, data)
 194             else:
 195                 return self.open_unknown(fullurl, data)
 196         try:
 197             if data is None:
 198                 return getattr(self, name)(url)
 199             else:
 200                 return getattr(self, name)(url, data)
 201         except socket.error, msg:
 202             raise IOError, ('socket error', msg), sys.exc_info()[2]
 203
 204     def open_unknown(self, fullurl, data=None):
 205         """Overridable interface to open unknown URL type."""
 206         type, url = splittype(fullurl)
 207         raise IOError, ('url error', 'unknown url type', type)
 208
 209     def open_unknown_proxy(self, proxy, fullurl, data=None):
 210         """Overridable interface to open unknown URL type."""
 211         type, url = splittype(fullurl)
 212         raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
 213
 214     # External interface
 215     def retrieve(self, url, filename=None, reporthook=None, data=None):
 216         """retrieve(url) returns (filename, headers) for a local object
 217         or (tempfilename, headers) for a remote object."""
 218         url = unwrap(toBytes(url))
 219         if self.tempcache and url in self.tempcache:
 220             return self.tempcache[url]
 221         type, url1 = splittype(url)
 222         if filename is None and (not type or type == 'file'):
 223             try:
 224                 fp = self.open_local_file(url1)
 225                 hdrs = fp.info()
 226                 del fp
 227                 return url2pathname(splithost(url1)[1]), hdrs
 228             except IOError, msg:
 229                 pass
 230         fp = self.open(url, data)
 231         headers = fp.info()
 232         if filename:
 233             tfp = open(filename, 'wb')
 234         else:
 235             import tempfile
 236             garbage, path = splittype(url)
 237             garbage, path = splithost(path or "")
 238             path, garbage = splitquery(path or "")
 239             path, garbage = splitattr(path or "")
 240             suffix = os.path.splitext(path)[1]
 241             (fd, filename) = tempfile.mkstemp(suffix)
 242             self.__tempfiles.append(filename)
 243             tfp = os.fdopen(fd, 'wb')
 244         result = filename, headers
 245         if self.tempcache is not None:
 246             self.tempcache[url] = result
 247         bs = 1024*8
 248         size = -1
 249         read = 0
 250         blocknum = 0
 251         if reporthook:
 252             if "content-length" in headers:
 253                 size = int(headers["Content-Length"])
 254             reporthook(blocknum, bs, size)
 255         while 1:
 256             block = fp.read(bs)
 257             if block == "":
 258                 break
 259             read += len(block)
 260             tfp.write(block)
 261             blocknum += 1
 262             if reporthook:
 263                 reporthook(blocknum, bs, size)
 264         fp.close()
 265         tfp.close()
 266         del fp
 267         del tfp
 268
 269         # raise exception if actual size does not match content-length header
 270         if size >= 0 and read < size:
 271             raise ContentTooShortError("retrieval incomplete: got only %i out "
 272                                        "of %i bytes" % (read, size), result)
 273
 274         return result
 275
 276     # Each method named open_<type> knows how to open that type of URL
 277
 278     def open_http(self, url, data=None):
 279         """Use HTTP protocol."""
 280         import httplib
 281         user_passwd = None
 282         proxy_passwd= None
 283         if isinstance(url, str):
 284             host, selector = splithost(url)
 285             if host:
 286                 user_passwd, host = splituser(host)
 287                 host = unquote(host)
 288             realhost = host
 289         else:
 290             host, selector = url
 291             # check whether the proxy contains authorization information
 292             proxy_passwd, host = splituser(host)
 293             # now we proceed with the url we want to obtain
 294             urltype, rest = splittype(selector)
 295             url = rest
 296             user_passwd = None
 297             if urltype.lower() != 'http':
 298                 realhost = None
 299             else:
 300                 realhost, rest = splithost(rest)
 301                 if realhost:
 302                     user_passwd, realhost = splituser(realhost)
 303                 if user_passwd:
 304                     selector = "%s://%s%s" % (urltype, realhost, rest)
 305                 if proxy_bypass(realhost):
 306                     host = realhost
 307
 308             #print "proxy via http:", host, selector
 309         if not host: raise IOError, ('http error', 'no host given')
 310
 311         if proxy_passwd:
 312             import base64
 313             proxy_auth = base64.b64encode(proxy_passwd).strip()
 314         else:
 315             proxy_auth = None
 316
 317         if user_passwd:
 318             import base64
 319             auth = base64.b64encode(user_passwd).strip()
 320         else:
 321             auth = None
 322         h = httplib.HTTP(host)
 323         if data is not None:
 324             h.putrequest('POST', selector)
 325             h.putheader('Content-Type', 'application/x-www-form-urlencoded')
 326             h.putheader('Content-Length', '%d' % len(data))
 327         else:
 328             h.putrequest('GET', selector)
 329         if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
 330         if auth: h.putheader('Authorization', 'Basic %s' % auth)
 331         if realhost: h.putheader('Host', realhost)
 332         for args in self.addheaders: h.putheader(*args)
 333         h.endheaders()
 334         if data is not None:
 335             h.send(data)
 336         errcode, errmsg, headers = h.getreply()
 337         fp = h.getfile()
 338         if errcode == -1:
 339             if fp: fp.close()
 340             # something went wrong with the HTTP status line
 341             raise IOError, ('http protocol error', 0,
 342                             'got a bad status line', None)
 343         # According to RFC 2616, "2xx" code indicates that the client's
 344         # request was successfully received, understood, and accepted.
 345         if not (200 <= errcode < 300):
 346             return addinfourl(fp, headers, "http:" + url)
 347         else:
 348             if data is None:
 349                 return self.http_error(url, fp, errcode, errmsg, headers)
 350             else:
 351                 return self.http_error(url, fp, errcode, errmsg, headers, data)
 352
 353     def http_error(self, url, fp, errcode, errmsg, headers, data=None):
 354         """Handle http errors.
 355         Derived class can override this, or provide specific handlers
 356         named http_error_DDD where DDD is the 3-digit error code."""
 357         # First check if there's a specific handler for this error
 358         name = 'http_error_%d' % errcode
 359         if hasattr(self, name):
 360             method = getattr(self, name)
 361             if data is None:
 362                 result = method(url, fp, errcode, errmsg, headers)
 363             else:
 364                 result = method(url, fp, errcode, errmsg, headers, data)
 365             if result: return result
 366         return self.http_error_default(url, fp, errcode, errmsg, headers)
 367
 368     def http_error_default(self, url, fp, errcode, errmsg, headers):
 369         """Default error handler: close the connection and raise IOError."""
 370         void = fp.read()
 371         fp.close()
 372         raise IOError, ('http error', errcode, errmsg, headers)
 373
 374     if _have_ssl:
 375         def open_https(self, url, data=None):
 376             """Use HTTPS protocol."""
 377
 378             import httplib
 379             user_passwd = None
 380             proxy_passwd = None
 381             if isinstance(url, str):
 382                 host, selector = splithost(url)
 383                 if host:
 384                     user_passwd, host = splituser(host)
 385                     host = unquote(host)
 386                 realhost = host
 387             else:
 388                 host, selector = url
 389                 # here, we determine, whether the proxy contains authorization information
 390                 proxy_passwd, host = splituser(host)
 391                 urltype, rest = splittype(selector)
 392                 url = rest
 393                 user_passwd = None
 394                 if urltype.lower() != 'https':
 395                     realhost = None
 396                 else:
 397                     realhost, rest = splithost(rest)
 398                     if realhost:
 399                         user_passwd, realhost = splituser(realhost)
 400                     if user_passwd:
 401                         selector = "%s://%s%s" % (urltype, realhost, rest)
 402                 #print "proxy via https:", host, selector
 403             if not host: raise IOError, ('https error', 'no host given')
 404             if proxy_passwd:
 405                 import base64
 406                 proxy_auth = base64.b64encode(proxy_passwd).strip()
 407             else:
 408                 proxy_auth = None
 409             if user_passwd:
 410                 import base64
 411                 auth = base64.b64encode(user_passwd).strip()
 412             else:
 413                 auth = None
 414             h = httplib.HTTPS(host, 0,
 415                               key_file=self.key_file,
 416                               cert_file=self.cert_file)
 417             if data is not None:
 418                 h.putrequest('POST', selector)
 419                 h.putheader('Content-Type',
 420                             'application/x-www-form-urlencoded')
 421                 h.putheader('Content-Length', '%d' % len(data))
 422             else:
 423                 h.putrequest('GET', selector)
 424             if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
 425             if auth: h.putheader('Authorization', 'Basic %s' % auth)
 426             if realhost: h.putheader('Host', realhost)
 427             for args in self.addheaders: h.putheader(*args)
 428             h.endheaders()
 429             if data is not None:
 430                 h.send(data)
 431             errcode, errmsg, headers = h.getreply()
 432             fp = h.getfile()
 433             if errcode == -1:
 434                 if fp: fp.close()
 435                 # something went wrong with the HTTP status line
 436                 raise IOError, ('http protocol error', 0,
 437                                 'got a bad status line', None)
 438             # According to RFC 2616, "2xx" code indicates that the client's
 439             # request was successfully received, understood, and accepted.
 440             if not (200 <= errcode < 300):
 441                 return addinfourl(fp, headers, "https:" + url)
 442             else:
 443                 if data is None:
 444                     return self.http_error(url, fp, errcode, errmsg, headers)
 445                 else:
 446                     return self.http_error(url, fp, errcode, errmsg, headers,
 447                                            data)
 448
 449     def open_file(self, url):
 450         """Use local file or FTP depending on form of URL."""
 451         if not isinstance(url, str):
 452             raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
 453         if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
 454             return self.open_ftp(url)
 455         else:
 456             return self.open_local_file(url)
 457
 458     def open_local_file(self, url):
 459         """Use local file."""
 460         import mimetypes, mimetools, email.utils
 461         try:
 462             from cStringIO import StringIO
 463         except ImportError:
 464             from StringIO import StringIO
 465         host, file = splithost(url)
 466         localname = url2pathname(file)
 467         try:
 468             stats = os.stat(localname)
 469         except OSError, e:
 470             raise IOError(e.errno, e.strerror, e.filename)
 471         size = stats.st_size
 472         modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
 473         mtype = mimetypes.guess_type(url)[0]
 474         headers = mimetools.Message(StringIO(
 475             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
 476             (mtype or 'text/plain', size, modified)))
 477         if not host:
 478             urlfile = file
 479             if file[:1] == '/':
 480                 urlfile = 'file://' + file
 481             return addinfourl(open(localname, 'rb'),
 482                               headers, urlfile)
 483         host, port = splitport(host)
 484         if not port \
 485            and socket.gethostbyname(host) in (localhost(), thishost()):
 486             urlfile = file
 487             if file[:1] == '/':
 488                 urlfile = 'file://' + file
 489             return addinfourl(open(localname, 'rb'),
 490                               headers, urlfile)
 491         raise IOError, ('local file error', 'not on local host')
 492
 493     def open_ftp(self, url):
 494         """Use FTP protocol."""
 495         if not isinstance(url, str):
 496             raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
 497         import mimetypes, mimetools
 498         try:
 499             from cStringIO import StringIO
 500         except ImportError:
 501             from StringIO import StringIO
 502         host, path = splithost(url)
 503         if not host: raise IOError, ('ftp error', 'no host given')
 504         host, port = splitport(host)
 505         user, host = splituser(host)
 506         if user: user, passwd = splitpasswd(user)
 507         else: passwd = None
 508         host = unquote(host)
 509         user = unquote(user or '')
 510         passwd = unquote(passwd or '')
 511         host = socket.gethostbyname(host)
 512         if not port:
 513             import ftplib
 514             port = ftplib.FTP_PORT
 515         else:
 516             port = int(port)
 517         path, attrs = splitattr(path)
 518         path = unquote(path)
 519         dirs = path.split('/')
 520         dirs, file = dirs[:-1], dirs[-1]
 521         if dirs and not dirs[0]: dirs = dirs[1:]
 522         if dirs and not dirs[0]: dirs[0] = '/'
 523         key = user, host, port, '/'.join(dirs)
 524         # XXX thread unsafe!
 525         if len(self.ftpcache) > MAXFTPCACHE:
 526             # Prune the cache, rather arbitrarily
 527             for k in self.ftpcache.keys():
 528                 if k != key:
 529                     v = self.ftpcache[k]
 530                     del self.ftpcache[k]
 531                     v.close()
 532         try:
 533             if not key in self.ftpcache:
 534                 self.ftpcache[key] = \
 535                     ftpwrapper(user, passwd, host, port, dirs)
 536             if not file: type = 'D'
 537             else: type = 'I'
 538             for attr in attrs:
 539                 attr, value = splitvalue(attr)
 540                 if attr.lower() == 'type' and \
 541                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
 542                     type = value.upper()
 543             (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
 544             mtype = mimetypes.guess_type("ftp:" + url)[0]
 545             headers = ""
 546             if mtype:
 547                 headers += "Content-Type: %s\n" % mtype
 548             if retrlen is not None and retrlen >= 0:
 549                 headers += "Content-Length: %d\n" % retrlen
 550             headers = mimetools.Message(StringIO(headers))
 551             return addinfourl(fp, headers, "ftp:" + url)
 552         except ftperrors(), msg:
 553             raise IOError, ('ftp error', msg), sys.exc_info()[2]
 554
 555     def open_data(self, url, data=None):
 556         """Use "data" URL."""
 557         if not isinstance(url, str):
 558             raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
 559         # ignore POSTed data
 560         #
 561         # syntax of data URLs:
 562         # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 563         # mediatype := [ type "/" subtype ] *( ";" parameter )
 564         # data      := *urlchar
 565         # parameter := attribute "=" value
 566         import mimetools
 567         try:
 568             from cStringIO import StringIO
 569         except ImportError:
 570             from StringIO import StringIO
 571         try:
 572             [type, data] = url.split(',', 1)
 573         except ValueError:
 574             raise IOError, ('data error', 'bad data URL')
 575         if not type:
 576             type = 'text/plain;charset=US-ASCII'
 577         semi = type.rfind(';')
 578         if semi >= 0 and '=' not in type[semi:]:
 579             encoding = type[semi+1:]
 580             type = type[:semi]
 581         else:
 582             encoding = ''
 583         msg = []
 584         msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
 585                                             time.gmtime(time.time())))
 586         msg.append('Content-type: %s' % type)
 587         if encoding == 'base64':
 588             import base64
 589             data = base64.decodestring(data)
 590         else:
 591             data = unquote(data)
 592         msg.append('Content-Length: %d' % len(data))
 593         msg.append('')
 594         msg.append(data)
 595         msg = '\n'.join(msg)
 596         f = StringIO(msg)
 597         headers = mimetools.Message(f, 0)
 598         #f.fileno = None     # needed for addinfourl
 599         return addinfourl(f, headers, url)
 600
 601
 602 class FancyURLopener(URLopener):
 603     """Derived class with handlers for errors we can handle (perhaps)."""
 604
 605     def __init__(self, *args, **kwargs):
 606         URLopener.__init__(self, *args, **kwargs)
 607         self.auth_cache = {}
 608         self.tries = 0
 609         self.maxtries = 10
 610
 611     def http_error_default(self, url, fp, errcode, errmsg, headers):
 612         """Default error handling -- don't raise an exception."""
 613         return addinfourl(fp, headers, "http:" + url)
 614
 615     def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
 616         """Error 302 -- relocated (temporarily)."""
 617         self.tries += 1
 618         if self.maxtries and self.tries >= self.maxtries:
 619             if hasattr(self, "http_error_500"):
 620                 meth = self.http_error_500
 621             else:
 622                 meth = self.http_error_default
 623             self.tries = 0
 624             return meth(url, fp, 500,
 625                         "Internal Server Error: Redirect Recursion", headers)
 626         result = self.redirect_internal(url, fp, errcode, errmsg, headers,
 627                                         data)
 628         self.tries = 0
 629         return result
 630
 631     def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
 632         if 'location' in headers:
 633             newurl = headers['location']
 634         elif 'uri' in headers:
 635             newurl = headers['uri']
 636         else:
 637             return
 638         void = fp.read()
 639         fp.close()
 640         # In case the server sent a relative URL, join with original:
 641         newurl = basejoin(self.type + ":" + url, newurl)
 642         return self.open(newurl)
 643
 644     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
 645         """Error 301 -- also relocated (permanently)."""
 646         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 647
 648     def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
 649         """Error 303 -- also relocated (essentially identical to 302)."""
 650         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 651
 652     def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
 653         """Error 307 -- relocated, but turn POST into error."""
 654         if data is None:
 655             return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 656         else:
 657             return self.http_error_default(url, fp, errcode, errmsg, headers)
 658
 659     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
 660         """Error 401 -- authentication required.
 661         This function supports Basic authentication only."""
 662         if not 'www-authenticate' in headers:
 663             URLopener.http_error_default(self, url, fp,
 664                                          errcode, errmsg, headers)
 665         stuff = headers['www-authenticate']
 666         import re
 667         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 668         if not match:
 669             URLopener.http_error_default(self, url, fp,
 670                                          errcode, errmsg, headers)
 671         scheme, realm = match.groups()
 672         if scheme.lower() != 'basic':
 673             URLopener.http_error_default(self, url, fp,
 674                                          errcode, errmsg, headers)
 675         name = 'retry_' + self.type + '_basic_auth'
 676         if data is None:
 677             return getattr(self,name)(url, realm)
 678         else:
 679             return getattr(self,name)(url, realm, data)
 680
 681     def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
 682         """Error 407 -- proxy authentication required.
 683         This function supports Basic authentication only."""
 684         if not 'proxy-authenticate' in headers:
 685             URLopener.http_error_default(self, url, fp,
 686                                          errcode, errmsg, headers)
 687         stuff = headers['proxy-authenticate']
 688         import re
 689         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 690         if not match:
 691             URLopener.http_error_default(self, url, fp,
 692                                          errcode, errmsg, headers)
 693         scheme, realm = match.groups()
 694         if scheme.lower() != 'basic':
 695             URLopener.http_error_default(self, url, fp,
 696                                          errcode, errmsg, headers)
 697         name = 'retry_proxy_' + self.type + '_basic_auth'
 698         if data is None:
 699             return getattr(self,name)(url, realm)
 700         else:
 701             return getattr(self,name)(url, realm, data)
 702
 703     def retry_proxy_http_basic_auth(self, url, realm, data=None):
 704         host, selector = splithost(url)
 705         newurl = 'http://' + host + selector
 706         proxy = self.proxies['http']
 707         urltype, proxyhost = splittype(proxy)
 708         proxyhost, proxyselector = splithost(proxyhost)
 709         i = proxyhost.find('@') + 1
 710         proxyhost = proxyhost[i:]
 711         user, passwd = self.get_user_passwd(proxyhost, realm, i)
 712         if not (user or passwd): return None
 713         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
 714         self.proxies['http'] = 'http://' + proxyhost + proxyselector
 715         if data is None:
 716             return self.open(newurl)
 717         else:
 718             return self.open(newurl, data)
 719
 720     def retry_proxy_https_basic_auth(self, url, realm, data=None):
 721         host, selector = splithost(url)
 722         newurl = 'https://' + host + selector
 723         proxy = self.proxies['https']
 724         urltype, proxyhost = splittype(proxy)
 725         proxyhost, proxyselector = splithost(proxyhost)
 726         i = proxyhost.find('@') + 1
 727         proxyhost = proxyhost[i:]
 728         user, passwd = self.get_user_passwd(proxyhost, realm, i)
 729         if not (user or passwd): return None
 730         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
 731         self.proxies['https'] = 'https://' + proxyhost + proxyselector
 732         if data is None:
 733             return self.open(newurl)
 734         else:
 735             return self.open(newurl, data)
 736
 737     def retry_http_basic_auth(self, url, realm, data=None):
 738         host, selector = splithost(url)
 739         i = host.find('@') + 1
 740         host = host[i:]
 741         user, passwd = self.get_user_passwd(host, realm, i)
 742         if not (user or passwd): return None
 743         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
 744         newurl = 'http://' + host + selector
 745         if data is None:
 746             return self.open(newurl)
 747         else:
 748             return self.open(newurl, data)
 749
 750     def retry_https_basic_auth(self, url, realm, data=None):
 751         host, selector = splithost(url)
 752         i = host.find('@') + 1
 753         host = host[i:]
 754         user, passwd = self.get_user_passwd(host, realm, i)
 755         if not (user or passwd): return None
 756         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
 757         newurl = 'https://' + host + selector
 758         if data is None:
 759             return self.open(newurl)
 760         else:
 761             return self.open(newurl, data)
 762
 763     def get_user_passwd(self, host, realm, clear_cache = 0):
 764         key = realm + '@' + host.lower()
 765         if key in self.auth_cache:
 766             if clear_cache:
 767                 del self.auth_cache[key]
 768             else:
 769                 return self.auth_cache[key]
 770         user, passwd = self.prompt_user_passwd(host, realm)
 771         if user or passwd: self.auth_cache[key] = (user, passwd)
 772         return user, passwd
 773
 774     def prompt_user_passwd(self, host, realm):
 775         """Override this in a GUI environment!"""
 776         import getpass
 777         try:
 778             user = raw_input("Enter username for %s at %s: " % (realm,
 779                                                                 host))
 780             passwd = getpass.getpass("Enter password for %s in %s at %s: " %
 781                 (user, realm, host))
 782             return user, passwd
 783         except KeyboardInterrupt:
 784             print
 785             return None, None
 786
 787
 788 # Utility functions
 789
 790 _localhost = None
 791 def localhost():
 792     """Return the IP address of the magic hostname 'localhost'."""
 793     global _localhost
 794     if _localhost is None:
 795         _localhost = socket.gethostbyname('localhost')
 796     return _localhost
 797
 798 _thishost = None
 799 def thishost():
 800     """Return the IP address of the current host."""
 801     global _thishost
 802     if _thishost is None:
 803         _thishost = socket.gethostbyname(socket.gethostname())
 804     return _thishost
 805
 806 _ftperrors = None
 807 def ftperrors():
 808     """Return the set of errors raised by the FTP class."""
 809     global _ftperrors
 810     if _ftperrors is None:
 811         import ftplib
 812         _ftperrors = ftplib.all_errors
 813     return _ftperrors
 814
 815 _noheaders = None
 816 def noheaders():
 817     """Return an empty mimetools.Message object."""
 818     global _noheaders
 819     if _noheaders is None:
 820         import mimetools
 821         try:
 822             from cStringIO import StringIO
 823         except ImportError:
 824             from StringIO import StringIO
 825         _noheaders = mimetools.Message(StringIO(), 0)
 826         _noheaders.fp.close()   # Recycle file descriptor
 827     return _noheaders
 828
 829
 830 # Utility classes
 831
 832 class ftpwrapper:
 833     """Class used by open_ftp() for cache of open FTP connections."""
 834
 835     def __init__(self, user, passwd, host, port, dirs, timeout=None):
 836         self.user = user
 837         self.passwd = passwd
 838         self.host = host
 839         self.port = port
 840         self.dirs = dirs
 841         self.timeout = timeout
 842         self.init()
 843
 844     def init(self):
 845         import ftplib
 846         self.busy = 0
 847         self.ftp = ftplib.FTP()
 848         self.ftp.connect(self.host, self.port, self.timeout)
 849         self.ftp.login(self.user, self.passwd)
 850         for dir in self.dirs:
 851             self.ftp.cwd(dir)
 852
 853     def retrfile(self, file, type):
 854         import ftplib
 855         self.endtransfer()
 856         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
 857         else: cmd = 'TYPE ' + type; isdir = 0
 858         try:
 859             self.ftp.voidcmd(cmd)
 860         except ftplib.all_errors:
 861             self.init()
 862             self.ftp.voidcmd(cmd)
 863         conn = None
 864         if file and not isdir:
 865             # Try to retrieve as a file
 866             try:
 867                 cmd = 'RETR ' + file
 868                 conn = self.ftp.ntransfercmd(cmd)
 869             except ftplib.error_perm, reason:
 870                 if str(reason)[:3] != '550':
 871                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
 872         if not conn:
 873             # Set transfer mode to ASCII!
 874             self.ftp.voidcmd('TYPE A')
 875             # Try a directory listing
 876             if file: cmd = 'LIST ' + file
 877             else: cmd = 'LIST'
 878             conn = self.ftp.ntransfercmd(cmd)
 879         self.busy = 1
 880         # Pass back both a suitably decorated object and a retrieval length
 881         return (addclosehook(conn[0].makefile('rb'),
 882                              self.endtransfer), conn[1])
 883     def endtransfer(self):
 884         if not self.busy:
 885             return
 886         self.busy = 0
 887         try:
 888             self.ftp.voidresp()
 889         except ftperrors():
 890             pass
 891
 892     def close(self):
 893         self.endtransfer()
 894         try:
 895             self.ftp.close()
 896         except ftperrors():
 897             pass
 898
 899 class addbase:
 900     """Base class for addinfo and addclosehook."""
 901
 902     def __init__(self, fp):
 903         self.fp = fp
 904         self.read = self.fp.read
 905         self.readline = self.fp.readline
 906         if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
 907         if hasattr(self.fp, "fileno"):
 908             self.fileno = self.fp.fileno
 909         else:
 910             self.fileno = lambda: None
 911         if hasattr(self.fp, "__iter__"):
 912             self.__iter__ = self.fp.__iter__
 913             if hasattr(self.fp, "next"):
 914                 self.next = self.fp.next
 915
 916     def __repr__(self):
 917         return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
 918                                              id(self), self.fp)
 919
 920     def close(self):
 921         self.read = None
 922         self.readline = None
 923         self.readlines = None
 924         self.fileno = None
 925         if self.fp: self.fp.close()
 926         self.fp = None
 927
 928 class addclosehook(addbase):
 929     """Class to add a close hook to an open file."""
 930
 931     def __init__(self, fp, closehook, *hookargs):
 932         addbase.__init__(self, fp)
 933         self.closehook = closehook
 934         self.hookargs = hookargs
 935
 936     def close(self):
 937         addbase.close(self)
 938         if self.closehook:
 939             self.closehook(*self.hookargs)
 940             self.closehook = None
 941             self.hookargs = None
 942
 943 class addinfo(addbase):
 944     """class to add an info() method to an open file."""
 945
 946     def __init__(self, fp, headers):
 947         addbase.__init__(self, fp)
 948         self.headers = headers
 949
 950     def info(self):
 951         return self.headers
 952
 953 class addinfourl(addbase):
 954     """class to add info() and geturl() methods to an open file."""
 955
 956     def __init__(self, fp, headers, url):
 957         addbase.__init__(self, fp)
 958         self.headers = headers
 959         self.url = url
 960
 961     def info(self):
 962         return self.headers
 963
 964     def geturl(self):
 965         return self.url
 966
 967
 968 # Utilities to parse URLs (most of these return None for missing parts):
 969 # unwrap('<URL:type://host/path>') --> 'type://host/path'
 970 # splittype('type:opaquestring') --> 'type', 'opaquestring'
 971 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
 972 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
 973 # splitpasswd('user:passwd') -> 'user', 'passwd'
 974 # splitport('host:port') --> 'host', 'port'
 975 # splitquery('/path?query') --> '/path', 'query'
 976 # splittag('/path#tag') --> '/path', 'tag'
 977 # splitattr('/path;attr1=value1;attr2=value2;...') ->
 978 #   '/path', ['attr1=value1', 'attr2=value2', ...]
 979 # splitvalue('attr=value') --> 'attr', 'value'
 980 # unquote('abc%20def') -> 'abc def'
 981 # quote('abc def') -> 'abc%20def')
 982
 983 try:
 984     unicode
 985 except NameError:
 986     def _is_unicode(x):
 987         return 0
 988 else:
 989     def _is_unicode(x):
 990         return isinstance(x, unicode)
 991
 992 def toBytes(url):
 993     """toBytes(u"URL") --> 'URL'."""
 994     # Most URL schemes require ASCII. If that changes, the conversion
 995     # can be relaxed
 996     if _is_unicode(url):
 997         try:
 998             url = url.encode("ASCII")
 999         except UnicodeError:
1000             raise UnicodeError("URL " + repr(url) +
1001                                " contains non-ASCII characters")
1002     return url
1003
1004 def unwrap(url):
1005     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1006     url = url.strip()
1007     if url[:1] == '<' and url[-1:] == '>':
1008         url = url[1:-1].strip()
1009     if url[:4] == 'URL:': url = url[4:].strip()
1010     return url
1011
1012 _typeprog = None
1013 def splittype(url):
1014     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1015     global _typeprog
1016     if _typeprog is None:
1017         import re
1018         _typeprog = re.compile('^([^/:]+):')
1019
1020     match = _typeprog.match(url)
1021     if match:
1022         scheme = match.group(1)
1023         return scheme.lower(), url[len(scheme) + 1:]
1024     return None, url
1025
1026 _hostprog = None
1027 def splithost(url):
1028     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1029     global _hostprog
1030     if _hostprog is None:
1031         import re
1032         _hostprog = re.compile('^//([^/?]*)(.*)$')
1033
1034     match = _hostprog.match(url)
1035     if match: return match.group(1, 2)
1036     return None, url
1037
1038 _userprog = None
1039 def splituser(host):
1040     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1041     global _userprog
1042     if _userprog is None:
1043         import re
1044         _userprog = re.compile('^(.*)@(.*)$')
1045
1046     match = _userprog.match(host)
1047     if match: return map(unquote, match.group(1, 2))
1048     return None, host
1049
1050 _passwdprog = None
1051 def splitpasswd(user):
1052     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1053     global _passwdprog
1054     if _passwdprog is None:
1055         import re
1056         _passwdprog = re.compile('^([^:]*):(.*)$')
1057
1058     match = _passwdprog.match(user)
1059     if match: return match.group(1, 2)
1060     return user, None
1061
1062 # splittag('/path#tag') --> '/path', 'tag'
1063 _portprog = None
1064 def splitport(host):
1065     """splitport('host:port') --> 'host', 'port'."""
1066     global _portprog
1067     if _portprog is None:
1068         import re
1069         _portprog = re.compile('^(.*):([0-9]+)$')
1070
1071     match = _portprog.match(host)
1072     if match: return match.group(1, 2)
1073     return host, None
1074
1075 _nportprog = None
1076 def splitnport(host, defport=-1):
1077     """Split host and port, returning numeric port.
1078     Return given default port if no ':' found; defaults to -1.
1079     Return numerical port if a valid number are found after ':'.
1080     Return None if ':' but not a valid number."""
1081     global _nportprog
1082     if _nportprog is None:
1083         import re
1084         _nportprog = re.compile('^(.*):(.*)$')
1085
1086     match = _nportprog.match(host)
1087     if match:
1088         host, port = match.group(1, 2)
1089         try:
1090             if not port: raise ValueError, "no digits"
1091             nport = int(port)
1092         except ValueError:
1093             nport = None
1094         return host, nport
1095     return host, defport
1096
1097 _queryprog = None
1098 def splitquery(url):
1099     """splitquery('/path?query') --> '/path', 'query'."""
1100     global _queryprog
1101     if _queryprog is None:
1102         import re
1103         _queryprog = re.compile('^(.*)\?([^?]*)$')
1104
1105     match = _queryprog.match(url)
1106     if match: return match.group(1, 2)
1107     return url, None
1108
1109 _tagprog = None
1110 def splittag(url):
1111     """splittag('/path#tag') --> '/path', 'tag'."""
1112     global _tagprog
1113     if _tagprog is None:
1114         import re
1115         _tagprog = re.compile('^(.*)#([^#]*)$')
1116
1117     match = _tagprog.match(url)
1118     if match: return match.group(1, 2)
1119     return url, None
1120
1121 def splitattr(url):
1122     """splitattr('/path;attr1=value1;attr2=value2;...') ->
1123         '/path', ['attr1=value1', 'attr2=value2', ...]."""
1124     words = url.split(';')
1125     return words[0], words[1:]
1126
1127 _valueprog = None
1128 def splitvalue(attr):
1129     """splitvalue('attr=value') --> 'attr', 'value'."""
1130     global _valueprog
1131     if _valueprog is None:
1132         import re
1133         _valueprog = re.compile('^([^=]*)=(.*)$')
1134
1135     match = _valueprog.match(attr)
1136     if match: return match.group(1, 2)
1137     return attr, None
1138
1139 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
1140 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
1141
1142 def unquote(s):
1143     """unquote('abc%20def') -> 'abc def'."""
1144     res = s.split('%')
1145     for i in xrange(1, len(res)):
1146         item = res[i]
1147         try:
1148             res[i] = _hextochr[item[:2]] + item[2:]
1149         except KeyError:
1150             res[i] = '%' + item
1151         except UnicodeDecodeError:
1152             res[i] = unichr(int(item[:2], 16)) + item[2:]
1153     return "".join(res)
1154
1155 def unquote_plus(s):
1156     """unquote('%7e/abc+def') -> '~/abc def'"""
1157     s = s.replace('+', ' ')
1158     return unquote(s)
1159
1160 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1161                'abcdefghijklmnopqrstuvwxyz'
1162                '0123456789' '_.-')
1163 _safemaps = {}
1164
1165 def quote(s, safe = '/'):
1166     """quote('abc def') -> 'abc%20def'
1167
1168     Each part of a URL, e.g. the path info, the query, etc., has a
1169     different set of reserved characters that must be quoted.
1170
1171     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1172     the following reserved characters.
1173
1174     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1175                   "$" | ","
1176
1177     Each of these characters is reserved in some component of a URL,
1178     but not necessarily in all of them.
1179
1180     By default, the quote function is intended for quoting the path
1181     section of a URL.  Thus, it will not encode '/'.  This character
1182     is reserved, but in typical usage the quote function is being
1183     called on a path where the existing slash characters are used as
1184     reserved characters.
1185     """
1186     cachekey = (safe, always_safe)
1187     try:
1188         safe_map = _safemaps[cachekey]
1189     except KeyError:
1190         safe += always_safe
1191         safe_map = {}
1192         for i in range(256):
1193             c = chr(i)
1194             safe_map[c] = (c in safe) and c or ('%%%02X' % i)
1195         _safemaps[cachekey] = safe_map
1196     res = map(safe_map.__getitem__, s)
1197     return ''.join(res)
1198
1199 def quote_plus(s, safe = ''):
1200     """Quote the query fragment of a URL; replacing ' ' with '+'"""
1201     if ' ' in s:
1202         s = quote(s, safe + ' ')
1203         return s.replace(' ', '+')
1204     return quote(s, safe)
1205
1206 def urlencode(query,doseq=0):
1207     """Encode a sequence of two-element tuples or dictionary into a URL query string.
1208
1209     If any values in the query arg are sequences and doseq is true, each
1210     sequence element is converted to a separate parameter.
1211
1212     If the query arg is a sequence of two-element tuples, the order of the
1213     parameters in the output will match the order of parameters in the
1214     input.
1215     """
1216
1217     if hasattr(query,"items"):
1218         # mapping objects
1219         query = query.items()
1220     else:
1221         # it's a bother at times that strings and string-like objects are
1222         # sequences...
1223         try:
1224             # non-sequence items should not work with len()
1225             # non-empty strings will fail this
1226             if len(query) and not isinstance(query[0], tuple):
1227                 raise TypeError
1228             # zero-length sequences of all types will get here and succeed,
1229             # but that's a minor nit - since the original implementation
1230             # allowed empty dicts that type of behavior probably should be
1231             # preserved for consistency
1232         except TypeError:
1233             ty,va,tb = sys.exc_info()
1234             raise TypeError, "not a valid non-string sequence or mapping object", tb
1235
1236     l = []
1237     if not doseq:
1238         # preserve old behavior
1239         for k, v in query:
1240             k = quote_plus(str(k))
1241             v = quote_plus(str(v))
1242             l.append(k + '=' + v)
1243     else:
1244         for k, v in query:
1245             k = quote_plus(str(k))
1246             if isinstance(v, str):
1247                 v = quote_plus(v)
1248                 l.append(k + '=' + v)
1249             elif _is_unicode(v):
1250                 # is there a reasonable way to convert to ASCII?
1251                 # encode generates a string, but "replace" or "ignore"
1252                 # lose information and "strict" can raise UnicodeError
1253                 v = quote_plus(v.encode("ASCII","replace"))
1254                 l.append(k + '=' + v)
1255             else:
1256                 try:
1257                     # is this a sufficient test for sequence-ness?
1258                     x = len(v)
1259                 except TypeError:
1260                     # not a sequence
1261                     v = quote_plus(str(v))
1262                     l.append(k + '=' + v)
1263                 else:
1264                     # loop over the sequence
1265                     for elt in v:
1266                         l.append(k + '=' + quote_plus(str(elt)))
1267     return '&'.join(l)
1268
1269 # Proxy handling
1270 def getproxies_environment():
1271     """Return a dictionary of scheme -> proxy server URL mappings.
1272
1273     Scan the environment for variables named <scheme>_proxy;
1274     this seems to be the standard convention.  If you need a
1275     different way, you can pass a proxies dictionary to the
1276     [Fancy]URLopener constructor.
1277
1278     """
1279     proxies = {}
1280     for name, value in os.environ.items():
1281         name = name.lower()
1282         if value and name[-6:] == '_proxy':
1283             proxies[name[:-6]] = value
1284     return proxies
1285
1286 if sys.platform == 'darwin':
1287     def getproxies_internetconfig():
1288         """Return a dictionary of scheme -> proxy server URL mappings.
1289
1290         By convention the mac uses Internet Config to store
1291         proxies.  An HTTP proxy, for instance, is stored under
1292         the HttpProxy key.
1293
1294         """
1295         try:
1296             import ic
1297         except ImportError:
1298             return {}
1299
1300         try:
1301             config = ic.IC()
1302         except ic.error:
1303             return {}
1304         proxies = {}
1305         # HTTP:
1306         if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
1307             try:
1308                 value = config['HTTPProxyHost']
1309             except ic.error:
1310                 pass
1311             else:
1312                 proxies['http'] = 'http://%s' % value
1313         # FTP: XXXX To be done.
1314         # Gopher: XXXX To be done.
1315         return proxies
1316
1317     def proxy_bypass(x):
1318         return 0
1319
1320     def getproxies():
1321         return getproxies_environment() or getproxies_internetconfig()
1322
1323 elif os.name == 'nt':
1324     def getproxies_registry():
1325         """Return a dictionary of scheme -> proxy server URL mappings.
1326
1327         Win32 uses the registry to store proxies.
1328
1329         """
1330         proxies = {}
1331         try:
1332             import _winreg
1333         except ImportError:
1334             # Std module, so should be around - but you never know!
1335             return proxies
1336         try:
1337             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1338                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1339             proxyEnable = _winreg.QueryValueEx(internetSettings,
1340                                                'ProxyEnable')[0]
1341             if proxyEnable:
1342                 # Returned as Unicode but problems if not converted to ASCII
1343                 proxyServer = str(_winreg.QueryValueEx(internetSettings,
1344                                                        'ProxyServer')[0])
1345                 if '=' in proxyServer:
1346                     # Per-protocol settings
1347                     for p in proxyServer.split(';'):
1348                         protocol, address = p.split('=', 1)
1349                         # See if address has a type:// prefix
1350                         import re
1351                         if not re.match('^([^/:]+)://', address):
1352                             address = '%s://%s' % (protocol, address)
1353                         proxies[protocol] = address
1354                 else:
1355                     # Use one setting for all protocols
1356                     if proxyServer[:5] == 'http:':
1357                         proxies['http'] = proxyServer
1358                     else:
1359                         proxies['http'] = 'http://%s' % proxyServer
1360                         proxies['ftp'] = 'ftp://%s' % proxyServer
1361             internetSettings.Close()
1362         except (WindowsError, ValueError, TypeError):
1363             # Either registry key not found etc, or the value in an
1364             # unexpected format.
1365             # proxies already set up to be empty so nothing to do
1366             pass
1367         return proxies
1368
1369     def getproxies():
1370         """Return a dictionary of scheme -> proxy server URL mappings.
1371
1372         Returns settings gathered from the environment, if specified,
1373         or the registry.
1374
1375         """
1376         return getproxies_environment() or getproxies_registry()
1377
1378     def proxy_bypass(host):
1379         try:
1380             import _winreg
1381             import re
1382         except ImportError:
1383             # Std modules, so should be around - but you never know!
1384             return 0
1385         try:
1386             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1387                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1388             proxyEnable = _winreg.QueryValueEx(internetSettings,
1389                                                'ProxyEnable')[0]
1390             proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1391                                                      'ProxyOverride')[0])
1392             # ^^^^ Returned as Unicode but problems if not converted to ASCII
1393         except WindowsError:
1394             return 0
1395         if not proxyEnable or not proxyOverride:
1396             return 0
1397         # try to make a host list from name and IP address.
1398         rawHost, port = splitport(host)
1399         host = [rawHost]
1400         try:
1401             addr = socket.gethostbyname(rawHost)
1402             if addr != rawHost:
1403                 host.append(addr)
1404         except socket.error:
1405             pass
1406         try:
1407             fqdn = socket.getfqdn(rawHost)
1408             if fqdn != rawHost:
1409                 host.append(fqdn)
1410         except socket.error:
1411             pass
1412         # make a check value list from the registry entry: replace the
1413         # '<local>' string by the localhost entry and the corresponding
1414         # canonical entry.
1415         proxyOverride = proxyOverride.split(';')
1416         i = 0
1417         while i < len(proxyOverride):
1418             if proxyOverride[i] == '<local>':
1419                 proxyOverride[i:i+1] = ['localhost',
1420                                         '127.0.0.1',
1421                                         socket.gethostname(),
1422                                         socket.gethostbyname(
1423                                             socket.gethostname())]
1424             i += 1
1425         # print proxyOverride
1426         # now check if we match one of the registry values.
1427         for test in proxyOverride:
1428             test = test.replace(".", r"\.")     # mask dots
1429             test = test.replace("*", r".*")     # change glob sequence
1430             test = test.replace("?", r".")      # change glob char
1431             for val in host:
1432                 # print "%s <--> %s" %( test, val )
1433                 if re.match(test, val, re.I):
1434                     return 1
1435         return 0
1436
1437 else:
1438     # By default use environment variables
1439     getproxies = getproxies_environment
1440
1441     def proxy_bypass(host):
1442         return 0
1443
1444 # Test and time quote() and unquote()
1445 def test1():
1446     s = ''
1447     for i in range(256): s = s + chr(i)
1448     s = s*4
1449     t0 = time.time()
1450     qs = quote(s)
1451     uqs = unquote(qs)
1452     t1 = time.time()
1453     if uqs != s:
1454         print 'Wrong!'
1455     print repr(s)
1456     print repr(qs)
1457     print repr(uqs)
1458     print round(t1 - t0, 3), 'sec'
1459
1460
1461 def reporthook(blocknum, blocksize, totalsize):
1462     # Report during remote transfers
1463     print "Block number: %d, Block size: %d, Total size: %d" % (
1464         blocknum, blocksize, totalsize)
1465
1466 # Test program
1467 def test(args=[]):
1468     if not args:
1469         args = [
1470             '/etc/passwd',
1471             'file:/etc/passwd',
1472             'file://localhost/etc/passwd',
1473             'ftp://ftp.gnu.org/pub/README',
1474             'http://www.python.org/index.html',
1475             ]
1476         if hasattr(URLopener, "open_https"):
1477             args.append('https://synergy.as.cmu.edu/~geek/')
1478     try:
1479         for url in args:
1480             print '-'*10, url, '-'*10
1481             fn, h = urlretrieve(url, None, reporthook)
1482             print fn
1483             if h:
1484                 print '======'
1485                 for k in h.keys(): print k + ':', h[k]
1486                 print '======'
1487             fp = open(fn, 'rb')
1488             data = fp.read()
1489             del fp
1490             if '\r' in data:
1491                 table = string.maketrans("", "")
1492                 data = data.translate(table, "\r")
1493             print data
1494             fn, h = None, None
1495         print '-'*40
1496     finally:
1497         urlcleanup()
1498
1499 def main():
1500     import getopt, sys
1501     try:
1502         opts, args = getopt.getopt(sys.argv[1:], "th")
1503     except getopt.error, msg:
1504         print msg
1505         print "Use -h for help"
1506         return
1507     t = 0
1508     for o, a in opts:
1509         if o == '-t':
1510             t = t + 1
1511         if o == '-h':
1512             print "Usage: python urllib.py [-t] [url ...]"
1513             print "-t runs self-test;",
1514             print "otherwise, contents of urls are printed"
1515             return
1516     if t:
1517         if t > 1:
1518             test1()
1519         test(args)
1520     else:
1521         if not args:
1522             print "Use -h for help"
1523         for url in args:
1524             print urlopen(url).read(),
1525
1526 # Run test program when run as a script
1527 if __name__ == '__main__':
1528     main()