Lib/urllib.py

   1 """Open an arbitrary URL.
   2
   3 See the following document for more info on URLs:
   4 "Names and Addresses, URIs, URLs, URNs, URCs", at
   5 http://www.w3.org/pub/WWW/Addressing/Overview.html
   6
   7 See also the HTTP spec (from which the error codes are derived):
   8 "HTTP - Hypertext Transfer Protocol", at
   9 http://www.w3.org/pub/WWW/Protocols/
  10
  11 Related standards and specs:
  12 - RFC1808: the "relative URL" spec. (authoritative status)
  13 - RFC1738 - the "URL standard". (authoritative status)
  14 - RFC1630 - the "URI spec". (informational status)
  15
  16 The object returned by URLopener().open(file) will differ per
  17 protocol.  All you know is that is has methods read(), readline(),
  18 readlines(), fileno(), close() and info().  The read*(), fileno()
  19 and close() methods work like those of open files.
  20 The info() method returns a mimetools.Message object which can be
  21 used to query various info about the object, if available.
  22 (mimetools.Message objects are queried with the getheader() method.)
  23 """
  24
  25 import string
  26 import socket
  27 import os
  28 import time
  29 import sys
  30 from urlparse import urljoin as basejoin
  31 import warnings
  32
  33 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
  34            "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
  35            "urlencode", "url2pathname", "pathname2url", "splittag",
  36            "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
  37            "splittype", "splithost", "splituser", "splitpasswd", "splitport",
  38            "splitnport", "splitquery", "splitattr", "splitvalue",
  39            "getproxies"]
  40
  41 __version__ = '1.17'    # XXX This version is not always updated :-(
  42
  43 MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
  44
  45 # Helper for non-unix systems
  46 if os.name == 'mac':
  47     from macurl2path import url2pathname, pathname2url
  48 elif os.name == 'nt':
  49     from nturl2path import url2pathname, pathname2url
  50 elif os.name == 'riscos':
  51     from rourl2path import url2pathname, pathname2url
  52 else:
  53     def url2pathname(pathname):
  54         """OS-specific conversion from a relative URL of the 'file' scheme
  55         to a file system path; not recommended for general use."""
  56         return unquote(pathname)
  57
  58     def pathname2url(pathname):
  59         """OS-specific conversion from a file system path to a relative URL
  60         of the 'file' scheme; not recommended for general use."""
  61         return quote(pathname)
  62
  63 # This really consists of two pieces:
  64 # (1) a class which handles opening of all sorts of URLs
  65 #     (plus assorted utilities etc.)
  66 # (2) a set of functions for parsing URLs
  67 # XXX Should these be separated out into different modules?
  68
  69
  70 # Shortcut for basic usage
  71 _urlopener = None
  72 def urlopen(url, data=None, proxies=None):
  73     """Create a file-like object for the specified URL to read from."""
  74     from warnings import warnpy3k
  75     warnings.warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
  76                         "favor of urllib2.urlopen()", stacklevel=2)
  77
  78     global _urlopener
  79     if proxies is not None:
  80         opener = FancyURLopener(proxies=proxies)
  81     elif not _urlopener:
  82         opener = FancyURLopener()
  83         _urlopener = opener
  84     else:
  85         opener = _urlopener
  86     if data is None:
  87         return opener.open(url)
  88     else:
  89         return opener.open(url, data)
  90 def urlretrieve(url, filename=None, reporthook=None, data=None):
  91     global _urlopener
  92     if not _urlopener:
  93         _urlopener = FancyURLopener()
  94     return _urlopener.retrieve(url, filename, reporthook, data)
  95 def urlcleanup():
  96     if _urlopener:
  97         _urlopener.cleanup()
  98
  99 # check for SSL
 100 try:
 101     import ssl
 102 except:
 103     _have_ssl = False
 104 else:
 105     _have_ssl = True
 106
 107 # exception raised when downloaded size does not match content-length
 108 class ContentTooShortError(IOError):
 109     def __init__(self, message, content):
 110         IOError.__init__(self, message)
 111         self.content = content
 112
 113 ftpcache = {}
 114 class URLopener:
 115     """Class to open URLs.
 116     This is a class rather than just a subroutine because we may need
 117     more than one set of global protocol-specific options.
 118     Note -- this is a base class for those who don't want the
 119     automatic handling of errors type 302 (relocated) and 401
 120     (authorization needed)."""
 121
 122     __tempfiles = None
 123
 124     version = "Python-urllib/%s" % __version__
 125
 126     # Constructor
 127     def __init__(self, proxies=None, **x509):
 128         if proxies is None:
 129             proxies = getproxies()
 130         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
 131         self.proxies = proxies
 132         self.key_file = x509.get('key_file')
 133         self.cert_file = x509.get('cert_file')
 134         self.addheaders = [('User-Agent', self.version)]
 135         self.__tempfiles = []
 136         self.__unlink = os.unlink # See cleanup()
 137         self.tempcache = None
 138         # Undocumented feature: if you assign {} to tempcache,
 139         # it is used to cache files retrieved with
 140         # self.retrieve().  This is not enabled by default
 141         # since it does not work for changing documents (and I
 142         # haven't got the logic to check expiration headers
 143         # yet).
 144         self.ftpcache = ftpcache
 145         # Undocumented feature: you can use a different
 146         # ftp cache by assigning to the .ftpcache member;
 147         # in case you want logically independent URL openers
 148         # XXX This is not threadsafe.  Bah.
 149
 150     def __del__(self):
 151         self.close()
 152
 153     def close(self):
 154         self.cleanup()
 155
 156     def cleanup(self):
 157         # This code sometimes runs when the rest of this module
 158         # has already been deleted, so it can't use any globals
 159         # or import anything.
 160         if self.__tempfiles:
 161             for file in self.__tempfiles:
 162                 try:
 163                     self.__unlink(file)
 164                 except OSError:
 165                     pass
 166             del self.__tempfiles[:]
 167         if self.tempcache:
 168             self.tempcache.clear()
 169
 170     def addheader(self, *args):
 171         """Add a header to be used by the HTTP interface only
 172         e.g. u.addheader('Accept', 'sound/basic')"""
 173         self.addheaders.append(args)
 174
 175     # External interface
 176     def open(self, fullurl, data=None):
 177         """Use URLopener().open(file) instead of open(file, 'r')."""
 178         fullurl = unwrap(toBytes(fullurl))
 179         # percent encode url, fixing lame server errors for e.g, like space
 180         # within url paths.
 181         fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
 182         if self.tempcache and fullurl in self.tempcache:
 183             filename, headers = self.tempcache[fullurl]
 184             fp = open(filename, 'rb')
 185             return addinfourl(fp, headers, fullurl)
 186         urltype, url = splittype(fullurl)
 187         if not urltype:
 188             urltype = 'file'
 189         if urltype in self.proxies:
 190             proxy = self.proxies[urltype]
 191             urltype, proxyhost = splittype(proxy)
 192             host, selector = splithost(proxyhost)
 193             url = (host, fullurl) # Signal special case to open_*()
 194         else:
 195             proxy = None
 196         name = 'open_' + urltype
 197         self.type = urltype
 198         name = name.replace('-', '_')
 199         if not hasattr(self, name):
 200             if proxy:
 201                 return self.open_unknown_proxy(proxy, fullurl, data)
 202             else:
 203                 return self.open_unknown(fullurl, data)
 204         try:
 205             if data is None:
 206                 return getattr(self, name)(url)
 207             else:
 208                 return getattr(self, name)(url, data)
 209         except socket.error, msg:
 210             raise IOError, ('socket error', msg), sys.exc_info()[2]
 211
 212     def open_unknown(self, fullurl, data=None):
 213         """Overridable interface to open unknown URL type."""
 214         type, url = splittype(fullurl)
 215         raise IOError, ('url error', 'unknown url type', type)
 216
 217     def open_unknown_proxy(self, proxy, fullurl, data=None):
 218         """Overridable interface to open unknown URL type."""
 219         type, url = splittype(fullurl)
 220         raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
 221
 222     # External interface
 223     def retrieve(self, url, filename=None, reporthook=None, data=None):
 224         """retrieve(url) returns (filename, headers) for a local object
 225         or (tempfilename, headers) for a remote object."""
 226         url = unwrap(toBytes(url))
 227         if self.tempcache and url in self.tempcache:
 228             return self.tempcache[url]
 229         type, url1 = splittype(url)
 230         if filename is None and (not type or type == 'file'):
 231             try:
 232                 fp = self.open_local_file(url1)
 233                 hdrs = fp.info()
 234                 fp.close()
 235                 return url2pathname(splithost(url1)[1]), hdrs
 236             except IOError, msg:
 237                 pass
 238         fp = self.open(url, data)
 239         try:
 240             headers = fp.info()
 241             if filename:
 242                 tfp = open(filename, 'wb')
 243             else:
 244                 import tempfile
 245                 garbage, path = splittype(url)
 246                 garbage, path = splithost(path or "")
 247                 path, garbage = splitquery(path or "")
 248                 path, garbage = splitattr(path or "")
 249                 suffix = os.path.splitext(path)[1]
 250                 (fd, filename) = tempfile.mkstemp(suffix)
 251                 self.__tempfiles.append(filename)
 252                 tfp = os.fdopen(fd, 'wb')
 253             try:
 254                 result = filename, headers
 255                 if self.tempcache is not None:
 256                     self.tempcache[url] = result
 257                 bs = 1024*8
 258                 size = -1
 259                 read = 0
 260                 blocknum = 0
 261                 if reporthook:
 262                     if "content-length" in headers:
 263                         size = int(headers["Content-Length"])
 264                     reporthook(blocknum, bs, size)
 265                 while 1:
 266                     block = fp.read(bs)
 267                     if block == "":
 268                         break
 269                     read += len(block)
 270                     tfp.write(block)
 271                     blocknum += 1
 272                     if reporthook:
 273                         reporthook(blocknum, bs, size)
 274             finally:
 275                 tfp.close()
 276         finally:
 277             fp.close()
 278
 279         # raise exception if actual size does not match content-length header
 280         if size >= 0 and read < size:
 281             raise ContentTooShortError("retrieval incomplete: got only %i out "
 282                                        "of %i bytes" % (read, size), result)
 283
 284         return result
 285
 286     # Each method named open_<type> knows how to open that type of URL
 287
 288     def open_http(self, url, data=None):
 289         """Use HTTP protocol."""
 290         import httplib
 291         user_passwd = None
 292         proxy_passwd= None
 293         if isinstance(url, str):
 294             host, selector = splithost(url)
 295             if host:
 296                 user_passwd, host = splituser(host)
 297                 host = unquote(host)
 298             realhost = host
 299         else:
 300             host, selector = url
 301             # check whether the proxy contains authorization information
 302             proxy_passwd, host = splituser(host)
 303             # now we proceed with the url we want to obtain
 304             urltype, rest = splittype(selector)
 305             url = rest
 306             user_passwd = None
 307             if urltype.lower() != 'http':
 308                 realhost = None
 309             else:
 310                 realhost, rest = splithost(rest)
 311                 if realhost:
 312                     user_passwd, realhost = splituser(realhost)
 313                 if user_passwd:
 314                     selector = "%s://%s%s" % (urltype, realhost, rest)
 315                 if proxy_bypass(realhost):
 316                     host = realhost
 317
 318             #print "proxy via http:", host, selector
 319         if not host: raise IOError, ('http error', 'no host given')
 320
 321         if proxy_passwd:
 322             import base64
 323             proxy_auth = base64.b64encode(proxy_passwd).strip()
 324         else:
 325             proxy_auth = None
 326
 327         if user_passwd:
 328             import base64
 329             auth = base64.b64encode(user_passwd).strip()
 330         else:
 331             auth = None
 332         h = httplib.HTTP(host)
 333         if data is not None:
 334             h.putrequest('POST', selector)
 335             h.putheader('Content-Type', 'application/x-www-form-urlencoded')
 336             h.putheader('Content-Length', '%d' % len(data))
 337         else:
 338             h.putrequest('GET', selector)
 339         if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
 340         if auth: h.putheader('Authorization', 'Basic %s' % auth)
 341         if realhost: h.putheader('Host', realhost)
 342         for args in self.addheaders: h.putheader(*args)
 343         h.endheaders(data)
 344         errcode, errmsg, headers = h.getreply()
 345         fp = h.getfile()
 346         if errcode == -1:
 347             if fp: fp.close()
 348             # something went wrong with the HTTP status line
 349             raise IOError, ('http protocol error', 0,
 350                             'got a bad status line', None)
 351         # According to RFC 2616, "2xx" code indicates that the client's
 352         # request was successfully received, understood, and accepted.
 353         if (200 <= errcode < 300):
 354             return addinfourl(fp, headers, "http:" + url, errcode)
 355         else:
 356             if data is None:
 357                 return self.http_error(url, fp, errcode, errmsg, headers)
 358             else:
 359                 return self.http_error(url, fp, errcode, errmsg, headers, data)
 360
 361     def http_error(self, url, fp, errcode, errmsg, headers, data=None):
 362         """Handle http errors.
 363         Derived class can override this, or provide specific handlers
 364         named http_error_DDD where DDD is the 3-digit error code."""
 365         # First check if there's a specific handler for this error
 366         name = 'http_error_%d' % errcode
 367         if hasattr(self, name):
 368             method = getattr(self, name)
 369             if data is None:
 370                 result = method(url, fp, errcode, errmsg, headers)
 371             else:
 372                 result = method(url, fp, errcode, errmsg, headers, data)
 373             if result: return result
 374         return self.http_error_default(url, fp, errcode, errmsg, headers)
 375
 376     def http_error_default(self, url, fp, errcode, errmsg, headers):
 377         """Default error handler: close the connection and raise IOError."""
 378         void = fp.read()
 379         fp.close()
 380         raise IOError, ('http error', errcode, errmsg, headers)
 381
 382     if _have_ssl:
 383         def open_https(self, url, data=None):
 384             """Use HTTPS protocol."""
 385
 386             import httplib
 387             user_passwd = None
 388             proxy_passwd = None
 389             if isinstance(url, str):
 390                 host, selector = splithost(url)
 391                 if host:
 392                     user_passwd, host = splituser(host)
 393                     host = unquote(host)
 394                 realhost = host
 395             else:
 396                 host, selector = url
 397                 # here, we determine, whether the proxy contains authorization information
 398                 proxy_passwd, host = splituser(host)
 399                 urltype, rest = splittype(selector)
 400                 url = rest
 401                 user_passwd = None
 402                 if urltype.lower() != 'https':
 403                     realhost = None
 404                 else:
 405                     realhost, rest = splithost(rest)
 406                     if realhost:
 407                         user_passwd, realhost = splituser(realhost)
 408                     if user_passwd:
 409                         selector = "%s://%s%s" % (urltype, realhost, rest)
 410                 #print "proxy via https:", host, selector
 411             if not host: raise IOError, ('https error', 'no host given')
 412             if proxy_passwd:
 413                 import base64
 414                 proxy_auth = base64.b64encode(proxy_passwd).strip()
 415             else:
 416                 proxy_auth = None
 417             if user_passwd:
 418                 import base64
 419                 auth = base64.b64encode(user_passwd).strip()
 420             else:
 421                 auth = None
 422             h = httplib.HTTPS(host, 0,
 423                               key_file=self.key_file,
 424                               cert_file=self.cert_file)
 425             if data is not None:
 426                 h.putrequest('POST', selector)
 427                 h.putheader('Content-Type',
 428                             'application/x-www-form-urlencoded')
 429                 h.putheader('Content-Length', '%d' % len(data))
 430             else:
 431                 h.putrequest('GET', selector)
 432             if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
 433             if auth: h.putheader('Authorization', 'Basic %s' % auth)
 434             if realhost: h.putheader('Host', realhost)
 435             for args in self.addheaders: h.putheader(*args)
 436             h.endheaders(data)
 437             errcode, errmsg, headers = h.getreply()
 438             fp = h.getfile()
 439             if errcode == -1:
 440                 if fp: fp.close()
 441                 # something went wrong with the HTTP status line
 442                 raise IOError, ('http protocol error', 0,
 443                                 'got a bad status line', None)
 444             # According to RFC 2616, "2xx" code indicates that the client's
 445             # request was successfully received, understood, and accepted.
 446             if (200 <= errcode < 300):
 447                 return addinfourl(fp, headers, "https:" + url, errcode)
 448             else:
 449                 if data is None:
 450                     return self.http_error(url, fp, errcode, errmsg, headers)
 451                 else:
 452                     return self.http_error(url, fp, errcode, errmsg, headers,
 453                                            data)
 454
 455     def open_file(self, url):
 456         """Use local file or FTP depending on form of URL."""
 457         if not isinstance(url, str):
 458             raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
 459         if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
 460             return self.open_ftp(url)
 461         else:
 462             return self.open_local_file(url)
 463
 464     def open_local_file(self, url):
 465         """Use local file."""
 466         import mimetypes, mimetools, email.utils
 467         try:
 468             from cStringIO import StringIO
 469         except ImportError:
 470             from StringIO import StringIO
 471         host, file = splithost(url)
 472         localname = url2pathname(file)
 473         try:
 474             stats = os.stat(localname)
 475         except OSError, e:
 476             raise IOError(e.errno, e.strerror, e.filename)
 477         size = stats.st_size
 478         modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
 479         mtype = mimetypes.guess_type(url)[0]
 480         headers = mimetools.Message(StringIO(
 481             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
 482             (mtype or 'text/plain', size, modified)))
 483         if not host:
 484             urlfile = file
 485             if file[:1] == '/':
 486                 urlfile = 'file://' + file
 487             return addinfourl(open(localname, 'rb'),
 488                               headers, urlfile)
 489         host, port = splitport(host)
 490         if not port \
 491            and socket.gethostbyname(host) in (localhost(), thishost()):
 492             urlfile = file
 493             if file[:1] == '/':
 494                 urlfile = 'file://' + file
 495             return addinfourl(open(localname, 'rb'),
 496                               headers, urlfile)
 497         raise IOError, ('local file error', 'not on local host')
 498
 499     def open_ftp(self, url):
 500         """Use FTP protocol."""
 501         if not isinstance(url, str):
 502             raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
 503         import mimetypes, mimetools
 504         try:
 505             from cStringIO import StringIO
 506         except ImportError:
 507             from StringIO import StringIO
 508         host, path = splithost(url)
 509         if not host: raise IOError, ('ftp error', 'no host given')
 510         host, port = splitport(host)
 511         user, host = splituser(host)
 512         if user: user, passwd = splitpasswd(user)
 513         else: passwd = None
 514         host = unquote(host)
 515         user = unquote(user or '')
 516         passwd = unquote(passwd or '')
 517         host = socket.gethostbyname(host)
 518         if not port:
 519             import ftplib
 520             port = ftplib.FTP_PORT
 521         else:
 522             port = int(port)
 523         path, attrs = splitattr(path)
 524         path = unquote(path)
 525         dirs = path.split('/')
 526         dirs, file = dirs[:-1], dirs[-1]
 527         if dirs and not dirs[0]: dirs = dirs[1:]
 528         if dirs and not dirs[0]: dirs[0] = '/'
 529         key = user, host, port, '/'.join(dirs)
 530         # XXX thread unsafe!
 531         if len(self.ftpcache) > MAXFTPCACHE:
 532             # Prune the cache, rather arbitrarily
 533             for k in self.ftpcache.keys():
 534                 if k != key:
 535                     v = self.ftpcache[k]
 536                     del self.ftpcache[k]
 537                     v.close()
 538         try:
 539             if not key in self.ftpcache:
 540                 self.ftpcache[key] = \
 541                     ftpwrapper(user, passwd, host, port, dirs)
 542             if not file: type = 'D'
 543             else: type = 'I'
 544             for attr in attrs:
 545                 attr, value = splitvalue(attr)
 546                 if attr.lower() == 'type' and \
 547                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
 548                     type = value.upper()
 549             (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
 550             mtype = mimetypes.guess_type("ftp:" + url)[0]
 551             headers = ""
 552             if mtype:
 553                 headers += "Content-Type: %s\n" % mtype
 554             if retrlen is not None and retrlen >= 0:
 555                 headers += "Content-Length: %d\n" % retrlen
 556             headers = mimetools.Message(StringIO(headers))
 557             return addinfourl(fp, headers, "ftp:" + url)
 558         except ftperrors(), msg:
 559             raise IOError, ('ftp error', msg), sys.exc_info()[2]
 560
 561     def open_data(self, url, data=None):
 562         """Use "data" URL."""
 563         if not isinstance(url, str):
 564             raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
 565         # ignore POSTed data
 566         #
 567         # syntax of data URLs:
 568         # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 569         # mediatype := [ type "/" subtype ] *( ";" parameter )
 570         # data      := *urlchar
 571         # parameter := attribute "=" value
 572         import mimetools
 573         try:
 574             from cStringIO import StringIO
 575         except ImportError:
 576             from StringIO import StringIO
 577         try:
 578             [type, data] = url.split(',', 1)
 579         except ValueError:
 580             raise IOError, ('data error', 'bad data URL')
 581         if not type:
 582             type = 'text/plain;charset=US-ASCII'
 583         semi = type.rfind(';')
 584         if semi >= 0 and '=' not in type[semi:]:
 585             encoding = type[semi+1:]
 586             type = type[:semi]
 587         else:
 588             encoding = ''
 589         msg = []
 590         msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
 591                                             time.gmtime(time.time())))
 592         msg.append('Content-type: %s' % type)
 593         if encoding == 'base64':
 594             import base64
 595             data = base64.decodestring(data)
 596         else:
 597             data = unquote(data)
 598         msg.append('Content-Length: %d' % len(data))
 599         msg.append('')
 600         msg.append(data)
 601         msg = '\n'.join(msg)
 602         f = StringIO(msg)
 603         headers = mimetools.Message(f, 0)
 604         #f.fileno = None     # needed for addinfourl
 605         return addinfourl(f, headers, url)
 606
 607
 608 class FancyURLopener(URLopener):
 609     """Derived class with handlers for errors we can handle (perhaps)."""
 610
 611     def __init__(self, *args, **kwargs):
 612         URLopener.__init__(self, *args, **kwargs)
 613         self.auth_cache = {}
 614         self.tries = 0
 615         self.maxtries = 10
 616
 617     def http_error_default(self, url, fp, errcode, errmsg, headers):
 618         """Default error handling -- don't raise an exception."""
 619         return addinfourl(fp, headers, "http:" + url, errcode)
 620
 621     def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
 622         """Error 302 -- relocated (temporarily)."""
 623         self.tries += 1
 624         if self.maxtries and self.tries >= self.maxtries:
 625             if hasattr(self, "http_error_500"):
 626                 meth = self.http_error_500
 627             else:
 628                 meth = self.http_error_default
 629             self.tries = 0
 630             return meth(url, fp, 500,
 631                         "Internal Server Error: Redirect Recursion", headers)
 632         result = self.redirect_internal(url, fp, errcode, errmsg, headers,
 633                                         data)
 634         self.tries = 0
 635         return result
 636
 637     def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
 638         if 'location' in headers:
 639             newurl = headers['location']
 640         elif 'uri' in headers:
 641             newurl = headers['uri']
 642         else:
 643             return
 644         void = fp.read()
 645         fp.close()
 646         # In case the server sent a relative URL, join with original:
 647         newurl = basejoin(self.type + ":" + url, newurl)
 648         return self.open(newurl)
 649
 650     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
 651         """Error 301 -- also relocated (permanently)."""
 652         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 653
 654     def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
 655         """Error 303 -- also relocated (essentially identical to 302)."""
 656         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 657
 658     def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
 659         """Error 307 -- relocated, but turn POST into error."""
 660         if data is None:
 661             return self.http_error_302(url, fp, errcode, errmsg, headers, data)
 662         else:
 663             return self.http_error_default(url, fp, errcode, errmsg, headers)
 664
 665     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
 666         """Error 401 -- authentication required.
 667         This function supports Basic authentication only."""
 668         if not 'www-authenticate' in headers:
 669             URLopener.http_error_default(self, url, fp,
 670                                          errcode, errmsg, headers)
 671         stuff = headers['www-authenticate']
 672         import re
 673         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 674         if not match:
 675             URLopener.http_error_default(self, url, fp,
 676                                          errcode, errmsg, headers)
 677         scheme, realm = match.groups()
 678         if scheme.lower() != 'basic':
 679             URLopener.http_error_default(self, url, fp,
 680                                          errcode, errmsg, headers)
 681         name = 'retry_' + self.type + '_basic_auth'
 682         if data is None:
 683             return getattr(self,name)(url, realm)
 684         else:
 685             return getattr(self,name)(url, realm, data)
 686
 687     def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
 688         """Error 407 -- proxy authentication required.
 689         This function supports Basic authentication only."""
 690         if not 'proxy-authenticate' in headers:
 691             URLopener.http_error_default(self, url, fp,
 692                                          errcode, errmsg, headers)
 693         stuff = headers['proxy-authenticate']
 694         import re
 695         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
 696         if not match:
 697             URLopener.http_error_default(self, url, fp,
 698                                          errcode, errmsg, headers)
 699         scheme, realm = match.groups()
 700         if scheme.lower() != 'basic':
 701             URLopener.http_error_default(self, url, fp,
 702                                          errcode, errmsg, headers)
 703         name = 'retry_proxy_' + self.type + '_basic_auth'
 704         if data is None:
 705             return getattr(self,name)(url, realm)
 706         else:
 707             return getattr(self,name)(url, realm, data)
 708
 709     def retry_proxy_http_basic_auth(self, url, realm, data=None):
 710         host, selector = splithost(url)
 711         newurl = 'http://' + host + selector
 712         proxy = self.proxies['http']
 713         urltype, proxyhost = splittype(proxy)
 714         proxyhost, proxyselector = splithost(proxyhost)
 715         i = proxyhost.find('@') + 1
 716         proxyhost = proxyhost[i:]
 717         user, passwd = self.get_user_passwd(proxyhost, realm, i)
 718         if not (user or passwd): return None
 719         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
 720         self.proxies['http'] = 'http://' + proxyhost + proxyselector
 721         if data is None:
 722             return self.open(newurl)
 723         else:
 724             return self.open(newurl, data)
 725
 726     def retry_proxy_https_basic_auth(self, url, realm, data=None):
 727         host, selector = splithost(url)
 728         newurl = 'https://' + host + selector
 729         proxy = self.proxies['https']
 730         urltype, proxyhost = splittype(proxy)
 731         proxyhost, proxyselector = splithost(proxyhost)
 732         i = proxyhost.find('@') + 1
 733         proxyhost = proxyhost[i:]
 734         user, passwd = self.get_user_passwd(proxyhost, realm, i)
 735         if not (user or passwd): return None
 736         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
 737         self.proxies['https'] = 'https://' + proxyhost + proxyselector
 738         if data is None:
 739             return self.open(newurl)
 740         else:
 741             return self.open(newurl, data)
 742
 743     def retry_http_basic_auth(self, url, realm, data=None):
 744         host, selector = splithost(url)
 745         i = host.find('@') + 1
 746         host = host[i:]
 747         user, passwd = self.get_user_passwd(host, realm, i)
 748         if not (user or passwd): return None
 749         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
 750         newurl = 'http://' + host + selector
 751         if data is None:
 752             return self.open(newurl)
 753         else:
 754             return self.open(newurl, data)
 755
 756     def retry_https_basic_auth(self, url, realm, data=None):
 757         host, selector = splithost(url)
 758         i = host.find('@') + 1
 759         host = host[i:]
 760         user, passwd = self.get_user_passwd(host, realm, i)
 761         if not (user or passwd): return None
 762         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
 763         newurl = 'https://' + host + selector
 764         if data is None:
 765             return self.open(newurl)
 766         else:
 767             return self.open(newurl, data)
 768
 769     def get_user_passwd(self, host, realm, clear_cache = 0):
 770         key = realm + '@' + host.lower()
 771         if key in self.auth_cache:
 772             if clear_cache:
 773                 del self.auth_cache[key]
 774             else:
 775                 return self.auth_cache[key]
 776         user, passwd = self.prompt_user_passwd(host, realm)
 777         if user or passwd: self.auth_cache[key] = (user, passwd)
 778         return user, passwd
 779
 780     def prompt_user_passwd(self, host, realm):
 781         """Override this in a GUI environment!"""
 782         import getpass
 783         try:
 784             user = raw_input("Enter username for %s at %s: " % (realm,
 785                                                                 host))
 786             passwd = getpass.getpass("Enter password for %s in %s at %s: " %
 787                 (user, realm, host))
 788             return user, passwd
 789         except KeyboardInterrupt:
 790             print
 791             return None, None
 792
 793
 794 # Utility functions
 795
 796 _localhost = None
 797 def localhost():
 798     """Return the IP address of the magic hostname 'localhost'."""
 799     global _localhost
 800     if _localhost is None:
 801         _localhost = socket.gethostbyname('localhost')
 802     return _localhost
 803
 804 _thishost = None
 805 def thishost():
 806     """Return the IP address of the current host."""
 807     global _thishost
 808     if _thishost is None:
 809         _thishost = socket.gethostbyname(socket.gethostname())
 810     return _thishost
 811
 812 _ftperrors = None
 813 def ftperrors():
 814     """Return the set of errors raised by the FTP class."""
 815     global _ftperrors
 816     if _ftperrors is None:
 817         import ftplib
 818         _ftperrors = ftplib.all_errors
 819     return _ftperrors
 820
 821 _noheaders = None
 822 def noheaders():
 823     """Return an empty mimetools.Message object."""
 824     global _noheaders
 825     if _noheaders is None:
 826         import mimetools
 827         try:
 828             from cStringIO import StringIO
 829         except ImportError:
 830             from StringIO import StringIO
 831         _noheaders = mimetools.Message(StringIO(), 0)
 832         _noheaders.fp.close()   # Recycle file descriptor
 833     return _noheaders
 834
 835
 836 # Utility classes
 837
 838 class ftpwrapper:
 839     """Class used by open_ftp() for cache of open FTP connections."""
 840
 841     def __init__(self, user, passwd, host, port, dirs,
 842                  timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
 843         self.user = user
 844         self.passwd = passwd
 845         self.host = host
 846         self.port = port
 847         self.dirs = dirs
 848         self.timeout = timeout
 849         self.init()
 850
 851     def init(self):
 852         import ftplib
 853         self.busy = 0
 854         self.ftp = ftplib.FTP()
 855         self.ftp.connect(self.host, self.port, self.timeout)
 856         self.ftp.login(self.user, self.passwd)
 857         for dir in self.dirs:
 858             self.ftp.cwd(dir)
 859
 860     def retrfile(self, file, type):
 861         import ftplib
 862         self.endtransfer()
 863         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
 864         else: cmd = 'TYPE ' + type; isdir = 0
 865         try:
 866             self.ftp.voidcmd(cmd)
 867         except ftplib.all_errors:
 868             self.init()
 869             self.ftp.voidcmd(cmd)
 870         conn = None
 871         if file and not isdir:
 872             # Try to retrieve as a file
 873             try:
 874                 cmd = 'RETR ' + file
 875                 conn = self.ftp.ntransfercmd(cmd)
 876             except ftplib.error_perm, reason:
 877                 if str(reason)[:3] != '550':
 878                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
 879         if not conn:
 880             # Set transfer mode to ASCII!
 881             self.ftp.voidcmd('TYPE A')
 882             # Try a directory listing. Verify that directory exists.
 883             if file:
 884                 pwd = self.ftp.pwd()
 885                 try:
 886                     try:
 887                         self.ftp.cwd(file)
 888                     except ftplib.error_perm, reason:
 889                         raise IOError, ('ftp error', reason), sys.exc_info()[2]
 890                 finally:
 891                     self.ftp.cwd(pwd)
 892                 cmd = 'LIST ' + file
 893             else:
 894                 cmd = 'LIST'
 895             conn = self.ftp.ntransfercmd(cmd)
 896         self.busy = 1
 897         # Pass back both a suitably decorated object and a retrieval length
 898         return (addclosehook(conn[0].makefile('rb'),
 899                              self.endtransfer), conn[1])
 900     def endtransfer(self):
 901         if not self.busy:
 902             return
 903         self.busy = 0
 904         try:
 905             self.ftp.voidresp()
 906         except ftperrors():
 907             pass
 908
 909     def close(self):
 910         self.endtransfer()
 911         try:
 912             self.ftp.close()
 913         except ftperrors():
 914             pass
 915
 916 class addbase:
 917     """Base class for addinfo and addclosehook."""
 918
 919     def __init__(self, fp):
 920         self.fp = fp
 921         self.read = self.fp.read
 922         self.readline = self.fp.readline
 923         if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
 924         if hasattr(self.fp, "fileno"):
 925             self.fileno = self.fp.fileno
 926         else:
 927             self.fileno = lambda: None
 928         if hasattr(self.fp, "__iter__"):
 929             self.__iter__ = self.fp.__iter__
 930             if hasattr(self.fp, "next"):
 931                 self.next = self.fp.next
 932
 933     def __repr__(self):
 934         return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
 935                                              id(self), self.fp)
 936
 937     def close(self):
 938         self.read = None
 939         self.readline = None
 940         self.readlines = None
 941         self.fileno = None
 942         if self.fp: self.fp.close()
 943         self.fp = None
 944
 945 class addclosehook(addbase):
 946     """Class to add a close hook to an open file."""
 947
 948     def __init__(self, fp, closehook, *hookargs):
 949         addbase.__init__(self, fp)
 950         self.closehook = closehook
 951         self.hookargs = hookargs
 952
 953     def close(self):
 954         addbase.close(self)
 955         if self.closehook:
 956             self.closehook(*self.hookargs)
 957             self.closehook = None
 958             self.hookargs = None
 959
 960 class addinfo(addbase):
 961     """class to add an info() method to an open file."""
 962
 963     def __init__(self, fp, headers):
 964         addbase.__init__(self, fp)
 965         self.headers = headers
 966
 967     def info(self):
 968         return self.headers
 969
 970 class addinfourl(addbase):
 971     """class to add info() and geturl() methods to an open file."""
 972
 973     def __init__(self, fp, headers, url, code=None):
 974         addbase.__init__(self, fp)
 975         self.headers = headers
 976         self.url = url
 977         self.code = code
 978
 979     def info(self):
 980         return self.headers
 981
 982     def getcode(self):
 983         return self.code
 984
 985     def geturl(self):
 986         return self.url
 987
 988
 989 # Utilities to parse URLs (most of these return None for missing parts):
 990 # unwrap('<URL:type://host/path>') --> 'type://host/path'
 991 # splittype('type:opaquestring') --> 'type', 'opaquestring'
 992 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
 993 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
 994 # splitpasswd('user:passwd') -> 'user', 'passwd'
 995 # splitport('host:port') --> 'host', 'port'
 996 # splitquery('/path?query') --> '/path', 'query'
 997 # splittag('/path#tag') --> '/path', 'tag'
 998 # splitattr('/path;attr1=value1;attr2=value2;...') ->
 999 #   '/path', ['attr1=value1', 'attr2=value2', ...]
1000 # splitvalue('attr=value') --> 'attr', 'value'
1001 # unquote('abc%20def') -> 'abc def'
1002 # quote('abc def') -> 'abc%20def')
1003
1004 try:
1005     unicode
1006 except NameError:
1007     def _is_unicode(x):
1008         return 0
1009 else:
1010     def _is_unicode(x):
1011         return isinstance(x, unicode)
1012
1013 def toBytes(url):
1014     """toBytes(u"URL") --> 'URL'."""
1015     # Most URL schemes require ASCII. If that changes, the conversion
1016     # can be relaxed
1017     if _is_unicode(url):
1018         try:
1019             url = url.encode("ASCII")
1020         except UnicodeError:
1021             raise UnicodeError("URL " + repr(url) +
1022                                " contains non-ASCII characters")
1023     return url
1024
1025 def unwrap(url):
1026     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1027     url = url.strip()
1028     if url[:1] == '<' and url[-1:] == '>':
1029         url = url[1:-1].strip()
1030     if url[:4] == 'URL:': url = url[4:].strip()
1031     return url
1032
1033 _typeprog = None
1034 def splittype(url):
1035     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1036     global _typeprog
1037     if _typeprog is None:
1038         import re
1039         _typeprog = re.compile('^([^/:]+):')
1040
1041     match = _typeprog.match(url)
1042     if match:
1043         scheme = match.group(1)
1044         return scheme.lower(), url[len(scheme) + 1:]
1045     return None, url
1046
1047 _hostprog = None
1048 def splithost(url):
1049     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1050     global _hostprog
1051     if _hostprog is None:
1052         import re
1053         _hostprog = re.compile('^//([^/?]*)(.*)$')
1054
1055     match = _hostprog.match(url)
1056     if match: return match.group(1, 2)
1057     return None, url
1058
1059 _userprog = None
1060 def splituser(host):
1061     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1062     global _userprog
1063     if _userprog is None:
1064         import re
1065         _userprog = re.compile('^(.*)@(.*)$')
1066
1067     match = _userprog.match(host)
1068     if match: return map(unquote, match.group(1, 2))
1069     return None, host
1070
1071 _passwdprog = None
1072 def splitpasswd(user):
1073     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1074     global _passwdprog
1075     if _passwdprog is None:
1076         import re
1077         _passwdprog = re.compile('^([^:]*):(.*)$',re.S)
1078
1079     match = _passwdprog.match(user)
1080     if match: return match.group(1, 2)
1081     return user, None
1082
1083 # splittag('/path#tag') --> '/path', 'tag'
1084 _portprog = None
1085 def splitport(host):
1086     """splitport('host:port') --> 'host', 'port'."""
1087     global _portprog
1088     if _portprog is None:
1089         import re
1090         _portprog = re.compile('^(.*):([0-9]+)$')
1091
1092     match = _portprog.match(host)
1093     if match: return match.group(1, 2)
1094     return host, None
1095
1096 _nportprog = None
1097 def splitnport(host, defport=-1):
1098     """Split host and port, returning numeric port.
1099     Return given default port if no ':' found; defaults to -1.
1100     Return numerical port if a valid number are found after ':'.
1101     Return None if ':' but not a valid number."""
1102     global _nportprog
1103     if _nportprog is None:
1104         import re
1105         _nportprog = re.compile('^(.*):(.*)$')
1106
1107     match = _nportprog.match(host)
1108     if match:
1109         host, port = match.group(1, 2)
1110         try:
1111             if not port: raise ValueError, "no digits"
1112             nport = int(port)
1113         except ValueError:
1114             nport = None
1115         return host, nport
1116     return host, defport
1117
1118 _queryprog = None
1119 def splitquery(url):
1120     """splitquery('/path?query') --> '/path', 'query'."""
1121     global _queryprog
1122     if _queryprog is None:
1123         import re
1124         _queryprog = re.compile('^(.*)\?([^?]*)$')
1125
1126     match = _queryprog.match(url)
1127     if match: return match.group(1, 2)
1128     return url, None
1129
1130 _tagprog = None
1131 def splittag(url):
1132     """splittag('/path#tag') --> '/path', 'tag'."""
1133     global _tagprog
1134     if _tagprog is None:
1135         import re
1136         _tagprog = re.compile('^(.*)#([^#]*)$')
1137
1138     match = _tagprog.match(url)
1139     if match: return match.group(1, 2)
1140     return url, None
1141
1142 def splitattr(url):
1143     """splitattr('/path;attr1=value1;attr2=value2;...') ->
1144         '/path', ['attr1=value1', 'attr2=value2', ...]."""
1145     words = url.split(';')
1146     return words[0], words[1:]
1147
1148 _valueprog = None
1149 def splitvalue(attr):
1150     """splitvalue('attr=value') --> 'attr', 'value'."""
1151     global _valueprog
1152     if _valueprog is None:
1153         import re
1154         _valueprog = re.compile('^([^=]*)=(.*)$')
1155
1156     match = _valueprog.match(attr)
1157     if match: return match.group(1, 2)
1158     return attr, None
1159
1160 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
1161 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
1162
1163 def unquote(s):
1164     """unquote('abc%20def') -> 'abc def'."""
1165     res = s.split('%')
1166     for i in xrange(1, len(res)):
1167         item = res[i]
1168         try:
1169             res[i] = _hextochr[item[:2]] + item[2:]
1170         except KeyError:
1171             res[i] = '%' + item
1172         except UnicodeDecodeError:
1173             res[i] = unichr(int(item[:2], 16)) + item[2:]
1174     return "".join(res)
1175
1176 def unquote_plus(s):
1177     """unquote('%7e/abc+def') -> '~/abc def'"""
1178     s = s.replace('+', ' ')
1179     return unquote(s)
1180
1181 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1182                'abcdefghijklmnopqrstuvwxyz'
1183                '0123456789' '_.-')
1184 _safemaps = {}
1185
1186 def quote(s, safe = '/'):
1187     """quote('abc def') -> 'abc%20def'
1188
1189     Each part of a URL, e.g. the path info, the query, etc., has a
1190     different set of reserved characters that must be quoted.
1191
1192     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1193     the following reserved characters.
1194
1195     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1196                   "$" | ","
1197
1198     Each of these characters is reserved in some component of a URL,
1199     but not necessarily in all of them.
1200
1201     By default, the quote function is intended for quoting the path
1202     section of a URL.  Thus, it will not encode '/'.  This character
1203     is reserved, but in typical usage the quote function is being
1204     called on a path where the existing slash characters are used as
1205     reserved characters.
1206     """
1207     cachekey = (safe, always_safe)
1208     try:
1209         safe_map = _safemaps[cachekey]
1210     except KeyError:
1211         safe += always_safe
1212         safe_map = {}
1213         for i in range(256):
1214             c = chr(i)
1215             safe_map[c] = (c in safe) and c or ('%%%02X' % i)
1216         _safemaps[cachekey] = safe_map
1217     res = map(safe_map.__getitem__, s)
1218     return ''.join(res)
1219
1220 def quote_plus(s, safe = ''):
1221     """Quote the query fragment of a URL; replacing ' ' with '+'"""
1222     if ' ' in s:
1223         s = quote(s, safe + ' ')
1224         return s.replace(' ', '+')
1225     return quote(s, safe)
1226
1227 def urlencode(query,doseq=0):
1228     """Encode a sequence of two-element tuples or dictionary into a URL query string.
1229
1230     If any values in the query arg are sequences and doseq is true, each
1231     sequence element is converted to a separate parameter.
1232
1233     If the query arg is a sequence of two-element tuples, the order of the
1234     parameters in the output will match the order of parameters in the
1235     input.
1236     """
1237
1238     if hasattr(query,"items"):
1239         # mapping objects
1240         query = query.items()
1241     else:
1242         # it's a bother at times that strings and string-like objects are
1243         # sequences...
1244         try:
1245             # non-sequence items should not work with len()
1246             # non-empty strings will fail this
1247             if len(query) and not isinstance(query[0], tuple):
1248                 raise TypeError
1249             # zero-length sequences of all types will get here and succeed,
1250             # but that's a minor nit - since the original implementation
1251             # allowed empty dicts that type of behavior probably should be
1252             # preserved for consistency
1253         except TypeError:
1254             ty,va,tb = sys.exc_info()
1255             raise TypeError, "not a valid non-string sequence or mapping object", tb
1256
1257     l = []
1258     if not doseq:
1259         # preserve old behavior
1260         for k, v in query:
1261             k = quote_plus(str(k))
1262             v = quote_plus(str(v))
1263             l.append(k + '=' + v)
1264     else:
1265         for k, v in query:
1266             k = quote_plus(str(k))
1267             if isinstance(v, str):
1268                 v = quote_plus(v)
1269                 l.append(k + '=' + v)
1270             elif _is_unicode(v):
1271                 # is there a reasonable way to convert to ASCII?
1272                 # encode generates a string, but "replace" or "ignore"
1273                 # lose information and "strict" can raise UnicodeError
1274                 v = quote_plus(v.encode("ASCII","replace"))
1275                 l.append(k + '=' + v)
1276             else:
1277                 try:
1278                     # is this a sufficient test for sequence-ness?
1279                     x = len(v)
1280                 except TypeError:
1281                     # not a sequence
1282                     v = quote_plus(str(v))
1283                     l.append(k + '=' + v)
1284                 else:
1285                     # loop over the sequence
1286                     for elt in v:
1287                         l.append(k + '=' + quote_plus(str(elt)))
1288     return '&'.join(l)
1289
1290 # Proxy handling
1291 def getproxies_environment():
1292     """Return a dictionary of scheme -> proxy server URL mappings.
1293
1294     Scan the environment for variables named <scheme>_proxy;
1295     this seems to be the standard convention.  If you need a
1296     different way, you can pass a proxies dictionary to the
1297     [Fancy]URLopener constructor.
1298
1299     """
1300     proxies = {}
1301     for name, value in os.environ.items():
1302         name = name.lower()
1303         if value and name[-6:] == '_proxy':
1304             proxies[name[:-6]] = value
1305     return proxies
1306
1307 def proxy_bypass_environment(host):
1308     """Test if proxies should not be used for a particular host.
1309
1310     Checks the environment for a variable named no_proxy, which should
1311     be a list of DNS suffixes separated by commas, or '*' for all hosts.
1312     """
1313     no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
1314     # '*' is special case for always bypass
1315     if no_proxy == '*':
1316         return 1
1317     # strip port off host
1318     hostonly, port = splitport(host)
1319     # check if the host ends with any of the DNS suffixes
1320     for name in no_proxy.split(','):
1321         if name and (hostonly.endswith(name) or host.endswith(name)):
1322             return 1
1323     # otherwise, don't bypass
1324     return 0
1325
1326
1327 if sys.platform == 'darwin':
1328     from _scproxy import _get_proxy_settings, _get_proxies
1329
1330     def proxy_bypass_macosx_sysconf(host):
1331         """
1332         Return True iff this host shouldn't be accessed using a proxy
1333
1334         This function uses the MacOSX framework SystemConfiguration
1335         to fetch the proxy information.
1336         """
1337         import re
1338         import socket
1339         from fnmatch import fnmatch
1340
1341         hostonly, port = splitport(host)
1342
1343         def ip2num(ipAddr):
1344             parts = ipAddr.split('.')
1345             parts = map(int, parts)
1346             if len(parts) != 4:
1347                 parts = (parts + [0, 0, 0, 0])[:4]
1348             return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]
1349
1350         proxy_settings = _get_proxy_settings()
1351
1352         # Check for simple host names:
1353         if '.' not in host:
1354             if proxy_settings['exclude_simple']:
1355                 return True
1356
1357         hostIP = None
1358
1359         for value in proxy_settings.get('exceptions', ()):
1360             # Items in the list are strings like these: *.local, 169.254/16
1361             if not value: continue
1362
1363             m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
1364             if m is not None:
1365                 if hostIP is None:
1366                     try:
1367                         hostIP = socket.gethostbyname(hostonly)
1368                         hostIP = ip2num(hostIP)
1369                     except socket.error:
1370                         continue
1371
1372                 base = ip2num(m.group(1))
1373                 mask = int(m.group(2)[1:])
1374                 mask = 32 - mask
1375
1376                 if (hostIP >> mask) == (base >> mask):
1377                     return True
1378
1379             elif fnmatch(host, value):
1380                 return True
1381
1382         return False
1383
1384
1385     def getproxies_macosx_sysconf():
1386         """Return a dictionary of scheme -> proxy server URL mappings.
1387
1388         This function uses the MacOSX framework SystemConfiguration
1389         to fetch the proxy information.
1390         """
1391         return _get_proxies()
1392
1393
1394
1395     def proxy_bypass(host):
1396         if getproxies_environment():
1397             return proxy_bypass_environment(host)
1398         else:
1399             return proxy_bypass_macosx_sysconf(host)
1400
1401     def getproxies():
1402         return getproxies_environment() or getproxies_macosx_sysconf()
1403
1404 elif os.name == 'nt':
1405     def getproxies_registry():
1406         """Return a dictionary of scheme -> proxy server URL mappings.
1407
1408         Win32 uses the registry to store proxies.
1409
1410         """
1411         proxies = {}
1412         try:
1413             import _winreg
1414         except ImportError:
1415             # Std module, so should be around - but you never know!
1416             return proxies
1417         try:
1418             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1419                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1420             proxyEnable = _winreg.QueryValueEx(internetSettings,
1421                                                'ProxyEnable')[0]
1422             if proxyEnable:
1423                 # Returned as Unicode but problems if not converted to ASCII
1424                 proxyServer = str(_winreg.QueryValueEx(internetSettings,
1425                                                        'ProxyServer')[0])
1426                 if '=' in proxyServer:
1427                     # Per-protocol settings
1428                     for p in proxyServer.split(';'):
1429                         protocol, address = p.split('=', 1)
1430                         # See if address has a type:// prefix
1431                         import re
1432                         if not re.match('^([^/:]+)://', address):
1433                             address = '%s://%s' % (protocol, address)
1434                         proxies[protocol] = address
1435                 else:
1436                     # Use one setting for all protocols
1437                     if proxyServer[:5] == 'http:':
1438                         proxies['http'] = proxyServer
1439                     else:
1440                         proxies['http'] = 'http://%s' % proxyServer
1441                         proxies['ftp'] = 'ftp://%s' % proxyServer
1442             internetSettings.Close()
1443         except (WindowsError, ValueError, TypeError):
1444             # Either registry key not found etc, or the value in an
1445             # unexpected format.
1446             # proxies already set up to be empty so nothing to do
1447             pass
1448         return proxies
1449
1450     def getproxies():
1451         """Return a dictionary of scheme -> proxy server URL mappings.
1452
1453         Returns settings gathered from the environment, if specified,
1454         or the registry.
1455
1456         """
1457         return getproxies_environment() or getproxies_registry()
1458
1459     def proxy_bypass_registry(host):
1460         try:
1461             import _winreg
1462             import re
1463         except ImportError:
1464             # Std modules, so should be around - but you never know!
1465             return 0
1466         try:
1467             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
1468                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1469             proxyEnable = _winreg.QueryValueEx(internetSettings,
1470                                                'ProxyEnable')[0]
1471             proxyOverride = str(_winreg.QueryValueEx(internetSettings,
1472                                                      'ProxyOverride')[0])
1473             # ^^^^ Returned as Unicode but problems if not converted to ASCII
1474         except WindowsError:
1475             return 0
1476         if not proxyEnable or not proxyOverride:
1477             return 0
1478         # try to make a host list from name and IP address.
1479         rawHost, port = splitport(host)
1480         host = [rawHost]
1481         try:
1482             addr = socket.gethostbyname(rawHost)
1483             if addr != rawHost:
1484                 host.append(addr)
1485         except socket.error:
1486             pass
1487         try:
1488             fqdn = socket.getfqdn(rawHost)
1489             if fqdn != rawHost:
1490                 host.append(fqdn)
1491         except socket.error:
1492             pass
1493         # make a check value list from the registry entry: replace the
1494         # '<local>' string by the localhost entry and the corresponding
1495         # canonical entry.
1496         proxyOverride = proxyOverride.split(';')
1497         # now check if we match one of the registry values.
1498         for test in proxyOverride:
1499             if test == '<local>':
1500                 if '.' not in rawHost:
1501                     return 1
1502             test = test.replace(".", r"\.")     # mask dots
1503             test = test.replace("*", r".*")     # change glob sequence
1504             test = test.replace("?", r".")      # change glob char
1505             for val in host:
1506                 # print "%s <--> %s" %( test, val )
1507                 if re.match(test, val, re.I):
1508                     return 1
1509         return 0
1510
1511     def proxy_bypass(host):
1512         """Return a dictionary of scheme -> proxy server URL mappings.
1513
1514         Returns settings gathered from the environment, if specified,
1515         or the registry.
1516
1517         """
1518         if getproxies_environment():
1519             return proxy_bypass_environment(host)
1520         else:
1521             return proxy_bypass_registry(host)
1522
1523 else:
1524     # By default use environment variables
1525     getproxies = getproxies_environment
1526     proxy_bypass = proxy_bypass_environment
1527
1528 # Test and time quote() and unquote()
1529 def test1():
1530     s = ''
1531     for i in range(256): s = s + chr(i)
1532     s = s*4
1533     t0 = time.time()
1534     qs = quote(s)
1535     uqs = unquote(qs)
1536     t1 = time.time()
1537     if uqs != s:
1538         print 'Wrong!'
1539     print repr(s)
1540     print repr(qs)
1541     print repr(uqs)
1542     print round(t1 - t0, 3), 'sec'
1543
1544
1545 def reporthook(blocknum, blocksize, totalsize):
1546     # Report during remote transfers
1547     print "Block number: %d, Block size: %d, Total size: %d" % (
1548         blocknum, blocksize, totalsize)
1549
1550 # Test program
1551 def test(args=[]):
1552     if not args:
1553         args = [
1554             '/etc/passwd',
1555             'file:/etc/passwd',
1556             'file://localhost/etc/passwd',
1557             'ftp://ftp.gnu.org/pub/README',
1558             'http://www.python.org/index.html',
1559             ]
1560         if hasattr(URLopener, "open_https"):
1561             args.append('https://synergy.as.cmu.edu/~geek/')
1562     try:
1563         for url in args:
1564             print '-'*10, url, '-'*10
1565             fn, h = urlretrieve(url, None, reporthook)
1566             print fn
1567             if h:
1568                 print '======'
1569                 for k in h.keys(): print k + ':', h[k]
1570                 print '======'
1571             with open(fn, 'rb') as fp:
1572                 data = fp.read()
1573             if '\r' in data:
1574                 table = string.maketrans("", "")
1575                 data = data.translate(table, "\r")
1576             print data
1577             fn, h = None, None
1578         print '-'*40
1579     finally:
1580         urlcleanup()
1581
1582 def main():
1583     import getopt, sys
1584     try:
1585         opts, args = getopt.getopt(sys.argv[1:], "th")
1586     except getopt.error, msg:
1587         print msg
1588         print "Use -h for help"
1589         return
1590     t = 0
1591     for o, a in opts:
1592         if o == '-t':
1593             t = t + 1
1594         if o == '-h':
1595             print "Usage: python urllib.py [-t] [url ...]"
1596             print "-t runs self-test;",
1597             print "otherwise, contents of urls are printed"
1598             return
1599     if t:
1600         if t > 1:
1601             test1()
1602         test(args)
1603     else:
1604         if not args:
1605             print "Use -h for help"
1606         for url in args:
1607             print urlopen(url).read(),
1608
1609 # Run test program when run as a script
1610 if __name__ == '__main__':
1611     main()