issue5063: Fixes for building RPM on CentOS plus misc .spec file enhancements.
[python.git] / Lib / urllib.py
blobbe77e1de647bc84e6d7d6e6ffd8c19220d236409
1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
protocol.  All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
23 """
25 import string
26 import socket
27 import os
28 import time
29 import sys
30 from urlparse import urljoin as basejoin
31 import warnings
33 __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
34 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
35 "urlencode", "url2pathname", "pathname2url", "splittag",
36 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
37 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
38 "splitnport", "splitquery", "splitattr", "splitvalue",
39 "getproxies"]
41 __version__ = '1.17' # XXX This version is not always updated :-(
43 MAXFTPCACHE = 10 # Trim the ftp cache beyond this size
# Helper for non-unix systems
# Pick the OS-appropriate URL <-> pathname converters.  On the platforms
# below a dedicated helper module does the mapping; on plain posix systems
# the path and the URL differ only by percent-quoting.
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        # On posix the file-URL path IS the filesystem path, just quoted.
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
63 # This really consists of two pieces:
64 # (1) a class which handles opening of all sorts of URLs
65 # (plus assorted utilities etc.)
66 # (2) a set of functions for parsing URLs
67 # XXX Should these be separated out into different modules?
# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """Create a file-like object for the specified URL to read from.

    data, when given, is sent as a POST body by the opener; proxies, when
    given, overrides the environment-derived proxy mapping.
    """
    # Fix: the previous version also did "from warnings import warnpy3k"
    # here but never used the imported name -- the call below goes through
    # the module-level "warnings" import.  The dead local import is removed.
    warnings.warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
                      "favor of urllib2.urlopen()", stacklevel=2)

    global _urlopener
    if proxies is not None:
        # Explicit proxies get a private opener; it is NOT cached.
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url into filename (or a temporary file) via the shared
    module-level FancyURLopener, returning (filename, headers)."""
    global _urlopener
    if _urlopener is None:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
def urlcleanup():
    """Discard temporary files made by urlretrieve() and clear the
    module-level quoting and FTP-connection caches."""
    opener = _urlopener
    if opener is not None:
        opener.cleanup()
    _safemaps.clear()
    ftpcache.clear()
101 # check for SSL
102 try:
103 import ssl
104 except:
105 _have_ssl = False
106 else:
107 _have_ssl = True
# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """Raised by URLopener.retrieve() when fewer bytes arrive than the
    Content-Length header announced."""
    def __init__(self, message, content):
        IOError.__init__(self, message)
        # The partially transferred result -- the (filename, headers)
        # tuple built so far by retrieve().
        self.content = content
115 ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of errors type 302 (relocated) and 401
    (authorization needed)."""

    # Class-level default so cleanup() (called from __del__) does not hit
    # AttributeError when __init__ never completed.
    __tempfiles = None

    # User-Agent header value sent with every request (see __init__).
    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        # x509 may carry key_file/cert_file for HTTPS client certificates.
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve(). This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe. Bah.

    def __del__(self):
        # Remove temporary files when the opener is garbage-collected.
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)

    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        # percent encode url, fixing lame server errors for e.g, like space
        # within url paths.
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        # Dispatch on the scheme: open_http, open_file, open_ftp, ...
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        # A schemeless or file: URL with no explicit filename needs no copy:
        # hand back the local path directly.
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                # No filename given: download into a temp file whose suffix
                # is derived from the URL path.
                import tempfile
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                # NOTE(review): size is only taken from the headers when a
                # reporthook is supplied, so the short-read check at the end
                # of this method is a no-op without one.
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if block == "":
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError("retrieval incomplete: got only %i out "
                                       "of %i bytes" % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        # url is either a plain string, or the (proxyhost, fullurl) tuple
        # produced by open() when a proxy is configured.
        import httplib
        user_passwd = None
        proxy_passwd= None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')

        if proxy_passwd:
            import base64
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            import base64
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            # Any data means a form-encoded POST, per this opener's contract.
            h.putrequest('POST', selector)
            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
            h.putheader('Content-Length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == -1:
            if fp: fp.close()
            # something went wrong with the HTTP status line
            raise IOError, ('http protocol error', 0,
                            'got a bad status line', None)
        # According to RFC 2616, "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if (200 <= errcode < 300):
            return addinfourl(fp, headers, "http:" + url, errcode)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

    if _have_ssl:
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""

            # Mirrors open_http() but with httplib.HTTPS and the client
            # certificate configured on this opener.
            import httplib
            user_passwd = None
            proxy_passwd = None
            if isinstance(url, str):
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                # here, we determine, whether the proxy contains authorization information
                proxy_passwd, host = splituser(host)
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if proxy_passwd:
                import base64
                proxy_auth = base64.b64encode(proxy_passwd).strip()
            else:
                proxy_auth = None
            if user_passwd:
                import base64
                auth = base64.b64encode(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-Type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-Length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: h.putheader(*args)
            h.endheaders(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == -1:
                if fp: fp.close()
                # something went wrong with the HTTP status line
                raise IOError, ('http protocol error', 0,
                                'got a bad status line', None)
            # According to RFC 2616, "2xx" code indicates that the client's
            # request was successfully received, understood, and accepted.
            if (200 <= errcode < 300):
                return addinfourl(fp, headers, "https:" + url, errcode)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
        # file://host/... with a non-local, non-empty host falls back to FTP.
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, email.utils
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError, e:
            raise IOError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        # Synthesize HTTP-style headers for the local file.
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        # A host that resolves to this machine is still "local".
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
        import mimetypes, mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        # Connections are cached per (user, host, port, directory) so
        # successive fetches can reuse the login.
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not key in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            # No filename means a directory listing (type D), else binary.
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = mimetools.Message(StringIO(headers))
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        # A trailing ";something" without '=' is an encoding (e.g. base64),
        # not a mediatype parameter.
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO(msg)
        headers = mimetools.Message(f, 0)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        # Maps "realm@host" -> (user, passwd) from previous prompts.
        self.auth_cache = {}
        # Redirect-loop guard used by http_error_302().
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            # Too many consecutive redirects: report it as a server error.
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        # Follow the redirect target named by the Location (or URI) header;
        # returns None when neither header is present.
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        # NOTE(review): on a missing/malformed challenge the calls below go
        # to URLopener.http_error_default, which raises IOError, so control
        # never reaches the retry dispatch in those cases.
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        # Same structure as http_error_401, but against the proxy challenge.
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request after embedding credentials in the proxy URL.
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        # Strip any credentials already present in the proxy host.
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        # HTTPS flavor of retry_proxy_http_basic_auth().
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        # Re-issue the request with user:pass@ embedded in the URL itself.
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        # HTTPS flavor of retry_http_basic_auth().
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        # Return cached credentials for realm@host, prompting when absent;
        # clear_cache forces a fresh prompt (used after a failed attempt).
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
796 # Utility functions
_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost',
    resolving it once and caching the result."""
    global _localhost
    if _localhost is not None:
        return _localhost
    _localhost = socket.gethostbyname('localhost')
    return _localhost
_thishost = None
def thishost():
    """Return the IP address of the current host, resolving it once
    and caching the result."""
    global _thishost
    if _thishost is not None:
        return _thishost
    _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost
_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is not None:
        return _ftperrors
    # Import lazily so the module loads even where ftplib is unwanted.
    import ftplib
    _ftperrors = ftplib.all_errors
    return _ftperrors
_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object (created lazily, shared)."""
    global _noheaders
    if _noheaders is not None:
        return _noheaders
    import mimetools
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    _noheaders = mimetools.Message(StringIO(), 0)
    _noheaders.fp.close()    # Recycle file descriptor
    return _noheaders
838 # Utility classes
class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs,
                 timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        # Directory components to cwd through after login.
        self.dirs = dirs
        self.timeout = timeout
        self.init()

    def init(self):
        # (Re)connect, log in and change into the target directory.
        # Also called from retrfile() to recover a dropped connection.
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port, self.timeout)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        # Retrieve 'file' (or a directory listing when it is empty / type D).
        # Returns (file-like object, length) where length may be None.
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            # Connection may have timed out: reconnect and retry once.
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                # 550 means "not a plain file"; fall through to a listing.
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing. Verify that directory exists.
            if file:
                pwd = self.ftp.pwd()
                try:
                    try:
                        self.ftp.cwd(file)
                    except ftplib.error_perm, reason:
                        raise IOError, ('ftp error', reason), sys.exc_info()[2]
                finally:
                    self.ftp.cwd(pwd)
                cmd = 'LIST ' + file
            else:
                cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        # Consume the end-of-transfer response so the control connection
        # is ready for the next command; safe to call when idle.
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        # Finish any pending transfer, then drop the control connection.
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass
class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        # Forward the reader interface of the wrapped file-like object.
        self.read = fp.read
        self.readline = fp.readline
        if hasattr(fp, "readlines"):
            self.readlines = fp.readlines
        # Not every file-like object carries a real descriptor.
        self.fileno = getattr(fp, "fileno", lambda: None)
        if hasattr(fp, "__iter__"):
            self.__iter__ = fp.__iter__
            if hasattr(fp, "next"):
                self.next = fp.next

    def __repr__(self):
        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
                                             id(self), self.fp)

    def close(self):
        """Drop the forwarded methods and close the underlying file."""
        self.read = self.readline = self.readlines = self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None
class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        # Close the file first, then fire the hook exactly once.
        addbase.close(self)
        hook = self.closehook
        if hook:
            hook(*self.hookargs)
            self.closehook = None
            self.hookargs = None
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        """Return the header object supplied at construction."""
        return self.headers
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url, code=None):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url
        self.code = code

    def geturl(self):
        """Return the URL the data was read from."""
        return self.url

    def getcode(self):
        """Return the HTTP status code, or None when not applicable."""
        return self.code

    def info(self):
        """Return the response headers object."""
        return self.headers
991 # Utilities to parse URLs (most of these return None for missing parts):
992 # unwrap('<URL:type://host/path>') --> 'type://host/path'
993 # splittype('type:opaquestring') --> 'type', 'opaquestring'
994 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
995 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
996 # splitpasswd('user:passwd') -> 'user', 'passwd'
997 # splitport('host:port') --> 'host', 'port'
998 # splitquery('/path?query') --> '/path', 'query'
999 # splittag('/path#tag') --> '/path', 'tag'
1000 # splitattr('/path;attr1=value1;attr2=value2;...') ->
1001 # '/path', ['attr1=value1', 'attr2=value2', ...]
1002 # splitvalue('attr=value') --> 'attr', 'value'
1003 # unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
1006 try:
1007 unicode
1008 except NameError:
1009 def _is_unicode(x):
1010 return 0
1011 else:
1012 def _is_unicode(x):
1013 return isinstance(x, unicode)
def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII; relax the conversion if that
    # ever changes.
    if not _is_unicode(url):
        return url
    try:
        return url.encode("ASCII")
    except UnicodeError:
        raise UnicodeError("URL " + repr(url) +
                           " contains non-ASCII characters")
def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    # Peel one layer of <...> wrapping, if present.
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1].strip()
    # Drop an optional 'URL:' marker.
    if url.startswith('URL:'):
        url = url[4:].strip()
    return url
_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        # Compiled lazily so importing this module stays cheap.
        import re
        _typeprog = re.compile(r'^([^/:]+):')

    m = _typeprog.match(url)
    if m is None:
        return None, url
    scheme = m.group(1)
    # The scheme is case-insensitive; normalize to lower case.
    return scheme.lower(), url[len(scheme) + 1:]
_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        # Host is everything after '//' up to the first '/' or '?'.
        _hostprog = re.compile(r'^//([^/?]*)(.*)$')

    m = _hostprog.match(url)
    if m is None:
        return None, url
    return m.group(1, 2)
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        # Greedy '.*' means the userinfo ends at the *last* '@'.
        _userprog = re.compile('^(.*)@(.*)$')

    m = _userprog.match(host)
    if m is None:
        return None, host
    # Both halves are percent-decoded, matching historical behaviour.
    return map(unquote, m.group(1, 2))
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        # re.S lets the password part contain newlines.
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    m = _passwdprog.match(user)
    if m is None:
        return user, None
    return m.group(1, 2)
1085 # splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        # The port must be all digits; anything else leaves host intact.
        _portprog = re.compile('^(.*):([0-9]+)$')

    m = _portprog.match(host)
    if m is None:
        return host, None
    return m.group(1, 2)
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.
    """
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    m = _nportprog.match(host)
    if m is None:
        # No ':' at all: fall back to the caller-supplied default.
        return host, defport
    host, port = m.group(1, 2)
    if not port:
        return host, None
    try:
        return host, int(port)
    except ValueError:
        # ':' present but the suffix is not a number.
        return host, None
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        # Greedy '.*' splits at the *last* '?'.
        _queryprog = re.compile(r'^(.*)\?([^?]*)$')

    m = _queryprog.match(url)
    if m is None:
        return url, None
    return m.group(1, 2)
_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        # Greedy '.*' splits at the *last* '#'.
        _tagprog = re.compile('^(.*)#([^#]*)$')

    m = _tagprog.match(url)
    if m is None:
        return url, None
    return m.group(1, 2)
def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    # Everything before the first ';' is the path; the rest are the
    # ';'-separated attribute assignments.
    pieces = url.split(';')
    return pieces[0], pieces[1:]
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        # Split at the *first* '='; the value may contain more of them.
        _valueprog = re.compile(r'^([^=]*)=(.*)$')

    m = _valueprog.match(attr)
    if m is None:
        return attr, None
    return m.group(1, 2)
_hexdig = '0123456789ABCDEFabcdef'
# Map *every* two-hex-digit string -- including mixed-case pairs such as
# 'aB' -- to its character.  The previous table only listed all-lower and
# all-upper pairs, so escapes like '%aB' were left undecoded even though
# RFC 3986 makes hex digits case-insensitive.
_hextochr = dict((a + b, chr(int(a + b, 16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    # res[0] had no leading '%'; each later chunk should start with the
    # two hex digits of an escape sequence.
    for i in range(1, len(res)):
        item = res[i]
        try:
            res[i] = _hextochr[item[:2]] + item[2:]
        except KeyError:
            # Malformed escape: keep the '%' literally.
            res[i] = '%' + item
        except UnicodeDecodeError:
            # Python 2 only: decoded byte meets a unicode suffix.
            res[i] = unichr(int(item[:2], 16)) + item[2:]
    return "".join(res)
def unquote_plus(s):
    """unquote('%7e/abc+def') -> '~/abc def'"""
    # In the query component '+' encodes a space; translate it first,
    # then percent-decode.
    return unquote(s.replace('+', ' '))
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
# Cache of per-(safe, always_safe) translation tables.
_safemaps = {}

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    cachekey = (safe, always_safe)
    try:
        safe_map = _safemaps[cachekey]
    except KeyError:
        # First time for this 'safe' set: build the 256-entry table
        # mapping each character to itself or to its %XX escape.
        safe += always_safe
        safe_map = {}
        for i in range(256):
            c = chr(i)
            safe_map[c] = c if c in safe else ('%%%02X' % i)
        _safemaps[cachekey] = safe_map
    return ''.join(map(safe_map.__getitem__, s))
def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' not in s:
        return quote(s, safe)
    # Keep spaces unescaped during quoting, then turn them into '+'.
    return quote(s, safe + ' ').replace(' ', '+')
1229 def urlencode(query,doseq=0):
1230 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1232 If any values in the query arg are sequences and doseq is true, each
1233 sequence element is converted to a separate parameter.
1235 If the query arg is a sequence of two-element tuples, the order of the
1236 parameters in the output will match the order of parameters in the
1237 input.
1240 if hasattr(query,"items"):
1241 # mapping objects
1242 query = query.items()
1243 else:
1244 # it's a bother at times that strings and string-like objects are
1245 # sequences...
1246 try:
1247 # non-sequence items should not work with len()
1248 # non-empty strings will fail this
1249 if len(query) and not isinstance(query[0], tuple):
1250 raise TypeError
1251 # zero-length sequences of all types will get here and succeed,
1252 # but that's a minor nit - since the original implementation
1253 # allowed empty dicts that type of behavior probably should be
1254 # preserved for consistency
1255 except TypeError:
1256 ty,va,tb = sys.exc_info()
1257 raise TypeError, "not a valid non-string sequence or mapping object", tb
1259 l = []
1260 if not doseq:
1261 # preserve old behavior
1262 for k, v in query:
1263 k = quote_plus(str(k))
1264 v = quote_plus(str(v))
1265 l.append(k + '=' + v)
1266 else:
1267 for k, v in query:
1268 k = quote_plus(str(k))
1269 if isinstance(v, str):
1270 v = quote_plus(v)
1271 l.append(k + '=' + v)
1272 elif _is_unicode(v):
1273 # is there a reasonable way to convert to ASCII?
1274 # encode generates a string, but "replace" or "ignore"
1275 # lose information and "strict" can raise UnicodeError
1276 v = quote_plus(v.encode("ASCII","replace"))
1277 l.append(k + '=' + v)
1278 else:
1279 try:
1280 # is this a sufficient test for sequence-ness?
1281 x = len(v)
1282 except TypeError:
1283 # not a sequence
1284 v = quote_plus(str(v))
1285 l.append(k + '=' + v)
1286 else:
1287 # loop over the sequence
1288 for elt in v:
1289 l.append(k + '=' + quote_plus(str(elt)))
1290 return '&'.join(l)
# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        # Empty values are treated as "no proxy configured".
        if value and name.endswith('_proxy'):
            proxies[name[:-6]] = value
    return proxies
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is the special always-bypass marker.
    if no_proxy == '*':
        return 1
    # Compare both with and without any ':port' suffix.
    hostonly, port = splitport(host)
    for name in no_proxy.split(','):
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # No suffix matched: do not bypass.
    return 0
if sys.platform == 'darwin':
    # Mac OS X: proxy settings come from the SystemConfiguration
    # framework, exposed by the _scproxy extension module.
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """
        Return True iff this host shouldn't be accessed using a proxy

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        import re
        import socket
        from fnmatch import fnmatch

        hostonly, port = splitport(host)

        def ip2num(ipAddr):
            # Pack a dotted-quad string into one 32-bit integer; short
            # addresses are zero-padded on the right.
            parts = ipAddr.split('.')
            parts = map(int, parts)
            if len(parts) != 4:
                parts = (parts + [0, 0, 0, 0])[:4]
            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

        proxy_settings = _get_proxy_settings()

        # Check for simple host names:
        if '.' not in host:
            if proxy_settings['exclude_simple']:
                return True

        # Resolved lazily, only if a numeric exception entry is seen.
        hostIP = None

        for value in proxy_settings.get('exceptions', ()):
            # Items in the list are strings like these: *.local, 169.254/16
            if not value: continue

            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
            if m is not None:
                # Numeric entry: compare network prefixes.
                if hostIP is None:
                    try:
                        hostIP = socket.gethostbyname(hostonly)
                        hostIP = ip2num(hostIP)
                    except socket.error:
                        continue

                # NOTE(review): assumes the entry carries a '/prefix'
                # suffix; a bare IP makes m.group(2) None and this would
                # raise TypeError -- confirm against later CPython fixes.
                base = ip2num(m.group(1))
                mask = int(m.group(2)[1:])
                mask = 32 - mask

                if (hostIP >> mask) == (base >> mask):
                    return True

            elif fnmatch(host, value):
                return True

        return False

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        # Environment variables, when set, take precedence over the
        # system configuration.
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.
        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.
        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass_registry(host):
        # Return 1 if the registry's ProxyOverride list says this host
        # should bypass the proxy, else 0.
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            # Translate the glob pattern into a regular expression.
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return non-zero if proxies should be bypassed for this host.

        Uses the environment's no_proxy setting when any <scheme>_proxy
        variables are present, and the Windows registry otherwise.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)

else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
# Test and time quote() and unquote()
def test1():
    """Round-trip all 256 byte values (x4) through quote()/unquote(),
    report mismatches, and print the elapsed time."""
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        # Round-trip failed: dump input and both transforms for diagnosis.
        print 'Wrong!'
        print repr(s)
        print repr(qs)
        print repr(uqs)
    print round(t1 - t0, 3), 'sec'
def reporthook(blocknum, blocksize, totalsize):
    """Progress callback passed to urlretrieve() by test()."""
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)
# Test program
def test(args=[]):
    """Retrieve each URL in args (or a default sample list) and print
    its headers and body; temp files are cleaned up at the end."""
    # args=[] default is safe here: it is rebound below, never mutated.
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.gnu.org/pub/README',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            with open(fn, 'rb') as fp:
                data = fp.read()
            if '\r' in data:
                # Strip carriage returns so the dump reads cleanly.
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        # Remove any temporary files created by urlretrieve().
        urlcleanup()
def main():
    """Command-line driver: -t runs the self-test (twice for timing),
    otherwise the contents of each URL argument are printed."""
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            # Each -t increments; -tt additionally runs the timing test.
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            # Trailing comma suppresses the extra newline after each body.
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()