1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that is has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
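# Illustrative usage sketch (not part of the original module): a typical
# fetch through urlopen() and inspection of the headers described above;
# the URL below is only an example.
#
#   f = urlopen('http://www.python.org/index.html')
#   print f.info().getheader('Content-Type')
#   data = f.read()
#   f.close()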
import string
import socket
import os
import time
import sys
import re
import base64
import httplib
import ftplib
import tempfile
from urlparse import urljoin as basejoin
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "getproxies"]
__version__ = '1.17'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10    # Trim the ftp cache beyond this size
# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        """OS-specific conversion from a relative URL of the 'file' scheme
        to a file system path; not recommended for general use."""
        return unquote(pathname)

    def pathname2url(pathname):
        """OS-specific conversion from a file system path to a relative URL
        of the 'file' scheme; not recommended for general use."""
        return quote(pathname)
# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?
# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """Create a file-like object for the specified URL to read from."""
    from warnings import warnpy3k
    warnpy3k("urllib.urlopen() has been removed in Python 3.0 in "
             "favor of urllib2.urlopen()", stacklevel=2)

    global _urlopener
    if proxies is not None:
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)
def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)
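# Illustrative usage sketch (not part of the original module): download a
# URL to a temporary file while reporting progress; the URL and the hook
# function are only examples.
#
#   def hook(blocknum, blocksize, totalsize):
#       print "got roughly %d bytes so far" % (blocknum * blocksize)
#
#   filename, headers = urlretrieve('http://www.python.org/index.html',
#                                   reporthook=hook)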
# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    def __init__(self, message, content):
        IOError.__init__(self, message)
        self.content = content
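# Illustrative sketch (not part of the original module): callers of
# urlretrieve() may catch this exception and inspect the partial result
# stored on it (see retrieve() below, which raises it with the
# (filename, headers) tuple).
#
#   try:
#       urlretrieve('http://www.python.org/index.html')
#   except ContentTooShortError, e:
#       filename, headers = e.content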
115 """Class to open URLs.
116 This is a class rather than just a subroutine because we may need
117 more than one set of global protocol-specific options.
118 Note -- this is a base class for those who don't want the
119 automatic handling of errors type 302 (relocated) and 401
120 (authorization needed)."""
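    # Illustrative sketch (not part of the original module): an explicit
    # proxies mapping can be passed to the constructor; the proxy URL below
    # is only an example.
    #
    #   opener = URLopener(proxies={'http': 'http://proxy.example.com:8080/'})
    #   f = opener.open('http://www.python.org/')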
    version = "Python-urllib/%s" % __version__
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-Agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink  # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.
    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()
    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        # percent-encode the url, fixing lame server errors (e.g. spaces
        # within url paths)
        fullurl = quote(fullurl, safe="%/:=&?~#+!$,;'@()*[]")
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl)  # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]
    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                fp.close()
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError:
                pass
        fp = self.open(url, data)
        try:
            headers = fp.info()
            if filename:
                tfp = open(filename, 'wb')
            else:
                garbage, path = splittype(url)
                garbage, path = splithost(path or "")
                path, garbage = splitquery(path or "")
                path, garbage = splitattr(path or "")
                suffix = os.path.splitext(path)[1]
                (fd, filename) = tempfile.mkstemp(suffix)
                self.__tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')
            try:
                result = filename, headers
                if self.tempcache is not None:
                    self.tempcache[url] = result
                bs = 1024*8
                size = -1
                read = 0
                blocknum = 0
                if reporthook:
                    if "content-length" in headers:
                        size = int(headers["Content-Length"])
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if block == "":
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError("retrieval incomplete: got only %i out "
                                       "of %i bytes" % (read, size), result)

        return result
    # Each method named open_<type> knows how to open that type of URL
    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # check whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            # now we proceed with the url we want to obtain
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost
            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')

        if proxy_passwd:
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None

        if user_passwd:
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-Type', 'application/x-www-form-urlencoded')
            h.putheader('Content-Length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == -1:
            if fp: fp.close()
            # something went wrong with the HTTP status line
            raise IOError, ('http protocol error', 0,
                            'got a bad status line', None)
        # According to RFC 2616, a "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if (200 <= errcode < 300):
            return addinfourl(fp, headers, "http:" + url, errcode)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)
    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        A derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)
    def open_https(self, url, data=None):
        """Use HTTPS protocol."""
        user_passwd = None
        proxy_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            # here we determine whether the proxy contains authorization information
            proxy_passwd, host = splituser(host)
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'https':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
            #print "proxy via https:", host, selector
        if not host: raise IOError, ('https error', 'no host given')
        if proxy_passwd:
            proxy_auth = base64.b64encode(proxy_passwd).strip()
        else:
            proxy_auth = None
        if user_passwd:
            auth = base64.b64encode(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTPS(host, 0,
                          key_file=self.key_file,
                          cert_file=self.cert_file)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-Type',
                        'application/x-www-form-urlencoded')
            h.putheader('Content-Length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == -1:
            if fp: fp.close()
            # something went wrong with the HTTP status line
            raise IOError, ('http protocol error', 0,
                            'got a bad status line', None)
        # According to RFC 2616, a "2xx" code indicates that the client's
        # request was successfully received, understood, and accepted.
        if (200 <= errcode < 300):
            return addinfourl(fp, headers, "https:" + url, errcode)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers,
                                       data)
    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if not isinstance(url, str):
            raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)
    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, email.utils
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError, e:
            raise IOError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')
    def open_ftp(self, url):
        """Use FTP protocol."""
        if not isinstance(url, str):
            raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
        import mimetypes, mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, passwd = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not key in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = mimetools.Message(StringIO(headers))
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]
    def open_data(self, url, data=None):
        """Use "data" URL."""
        if not isinstance(url, str):
            raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
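        # Illustrative example (not part of the original comments): the URL
        #   data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==
        # carries the base64-encoded text "Hello, World!".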
        import mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s' % time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                              time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-Length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO(msg)
        headers = mimetools.Message(f, 0)
        #f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10
    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url, errcode)
    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result
    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        return self.open(newurl)
    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)
    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        This function supports Basic authentication only."""
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)
    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 407 -- proxy authentication required.
        This function supports Basic authentication only."""
        if not 'proxy-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['proxy-authenticate']
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_proxy_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self, name)(url, realm)
        else:
            return getattr(self, name)(url, realm, data)
    def retry_proxy_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        newurl = 'http://' + host + selector
        proxy = self.proxies['http']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['http'] = 'http://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)
    def retry_proxy_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        newurl = 'https://' + host + selector
        proxy = self.proxies['https']
        urltype, proxyhost = splittype(proxy)
        proxyhost, proxyselector = splithost(proxyhost)
        i = proxyhost.find('@') + 1
        proxyhost = proxyhost[i:]
        user, passwd = self.get_user_passwd(proxyhost, realm, i)
        if not (user or passwd): return None
        proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
        self.proxies['https'] = 'https://' + proxyhost + proxyselector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)
    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)
    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'https://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)
    def get_user_passwd(self, host, realm, clear_cache=0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd
    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                                     (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
800 """Return the IP address of the magic hostname 'localhost'."""
802 if _localhost
is None:
803 _localhost
= socket
.gethostbyname('localhost')
808 """Return the IP address of the current host."""
810 if _thishost
is None:
811 _thishost
= socket
.gethostbyname(socket
.gethostname())
816 """Return the set of errors raised by the FTP class."""
818 if _ftperrors
is None:
820 _ftperrors
= ftplib
.all_errors
825 """Return an empty mimetools.Message object."""
827 if _noheaders
is None:
830 from cStringIO
import StringIO
832 from StringIO
import StringIO
833 _noheaders
= mimetools
.Message(StringIO(), 0)
834 _noheaders
.fp
.close() # Recycle file descriptor
841 """Class used by open_ftp() for cache of open FTP connections."""
843 def __init__(self
, user
, passwd
, host
, port
, dirs
,
844 timeout
=socket
._GLOBAL
_DEFAULT
_TIMEOUT
):
850 self
.timeout
= timeout
856 self
.ftp
= ftplib
.FTP()
857 self
.ftp
.connect(self
.host
, self
.port
, self
.timeout
)
858 self
.ftp
.login(self
.user
, self
.passwd
)
859 for dir in self
.dirs
:
862 def retrfile(self
, file, type):
865 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
866 else: cmd
= 'TYPE ' + type; isdir
= 0
868 self
.ftp
.voidcmd(cmd
)
869 except ftplib
.all_errors
:
871 self
.ftp
.voidcmd(cmd
)
873 if file and not isdir
:
874 # Try to retrieve as a file
877 conn
= self
.ftp
.ntransfercmd(cmd
)
878 except ftplib
.error_perm
, reason
:
879 if str(reason
)[:3] != '550':
880 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
882 # Set transfer mode to ASCII!
883 self
.ftp
.voidcmd('TYPE A')
884 # Try a directory listing. Verify that directory exists.
890 except ftplib
.error_perm
, reason
:
891 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
897 conn
= self
.ftp
.ntransfercmd(cmd
)
899 # Pass back both a suitably decorated object and a retrieval length
900 return (addclosehook(conn
[0].makefile('rb'),
901 self
.endtransfer
), conn
[1])
902 def endtransfer(self
):
919 """Base class for addinfo and addclosehook."""
921 def __init__(self
, fp
):
923 self
.read
= self
.fp
.read
924 self
.readline
= self
.fp
.readline
925 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
926 if hasattr(self
.fp
, "fileno"):
927 self
.fileno
= self
.fp
.fileno
929 self
.fileno
= lambda: None
930 if hasattr(self
.fp
, "__iter__"):
931 self
.__iter
__ = self
.fp
.__iter
__
932 if hasattr(self
.fp
, "next"):
933 self
.next
= self
.fp
.next
936 return '<%s at %r whose fp = %r>' % (self
.__class
__.__name
__,
942 self
.readlines
= None
944 if self
.fp
: self
.fp
.close()
class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        if self.closehook:
            self.closehook(*self.hookargs)
            self.closehook = None
            self.hookargs = None
        addbase.close(self)
class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers
class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url, code=None):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url
        self.code = code

    def info(self):
        return self.headers

    def getcode(self):
        return self.code

    def geturl(self):
        return self.url
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
def _is_unicode(x):
    return isinstance(x, unicode)

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    if _is_unicode(url):
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url
1028 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1030 if url
[:1] == '<' and url
[-1:] == '>':
1031 url
= url
[1:-1].strip()
1032 if url
[:4] == 'URL:': url
= url
[4:].strip()
1037 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1039 if _typeprog
is None:
1041 _typeprog
= re
.compile('^([^/:]+):')
1043 match
= _typeprog
.match(url
)
1045 scheme
= match
.group(1)
1046 return scheme
.lower(), url
[len(scheme
) + 1:]
1051 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1053 if _hostprog
is None:
1055 _hostprog
= re
.compile('^//([^/?]*)(.*)$')
1057 match
= _hostprog
.match(url
)
1058 if match
: return match
.group(1, 2)
_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host
_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None
# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None
_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport
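# Illustrative examples (not part of the original source):
#   splitnport('www.python.org:80') -> ('www.python.org', 80)
#   splitnport('www.python.org')    -> ('www.python.org', -1)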
_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None
1134 """splittag('/path#tag') --> '/path', 'tag'."""
1136 if _tagprog
is None:
1138 _tagprog
= re
.compile('^(.*)#([^#]*)$')
1140 match
= _tagprog
.match(url
)
1141 if match
: return match
.group(1, 2)
1145 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1146 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1147 words
= url
.split(';')
1148 return words
[0], words
[1:]
_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None
_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
_hextochr.update(('%02X' % i, chr(i)) for i in range(256))

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    for i in xrange(1, len(res)):
        item = res[i]
        try:
            res[i] = _hextochr[item[:2]] + item[2:]
        except KeyError:
            res[i] = '%' + item
        except UnicodeDecodeError:
            res[i] = unichr(int(item[:2], 16)) + item[2:]
    return "".join(res)
def unquote_plus(s):
    """unquote('%7e/abc+def') -> '~/abc def'"""
    s = s.replace('+', ' ')
    return unquote(s)
always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
_safemaps = {}
def quote(s, safe='/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    cachekey = (safe, always_safe)
    try:
        safe_map = _safemaps[cachekey]
    except KeyError:
        safe += always_safe
        safe_map = {}
        for i in range(256):
            c = chr(i)
            safe_map[c] = (c in safe) and c or ('%%%02X' % i)
        _safemaps[cachekey] = safe_map
    res = map(safe_map.__getitem__, s)
    return ''.join(res)
def quote_plus(s, safe=''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        s = quote(s, safe + ' ')
        return s.replace(' ', '+')
    return quote(s, safe)
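# Illustrative examples (not part of the original source):
#   quote('/~connolly/')      -> '/%7Econnolly/'
#   quote_plus('a few words') -> 'a+few+words'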
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif _is_unicode(v):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
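# Illustrative examples (not part of the original source):
#   urlencode([('q', 'a b'), ('lang', 'en')])  -> 'q=a+b&lang=en'
#   urlencode({'k': ['v1', 'v2']}, doseq=1)    -> 'k=v1&k=v2'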
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.
    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
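# Illustrative example (not part of the original source): with
#   http_proxy=http://proxy.example.com:8080/
# set in the environment, getproxies_environment() returns
#   {'http': 'http://proxy.example.com:8080/'}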
def proxy_bypass_environment(host):
    """Test if proxies should not be used for a particular host.

    Checks the environment for a variable named no_proxy, which should
    be a list of DNS suffixes separated by commas, or '*' for all hosts.
    """
    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
    # '*' is special case for always bypass
    if no_proxy == '*':
        return 1
    # strip port off host
    hostonly, port = splitport(host)
    # check if the host ends with any of the DNS suffixes
    for name in no_proxy.split(','):
        if name and (hostonly.endswith(name) or host.endswith(name)):
            return 1
    # otherwise, don't bypass
    return 0
if sys.platform == 'darwin':
    from _scproxy import _get_proxy_settings, _get_proxies

    def proxy_bypass_macosx_sysconf(host):
        """
        Return True iff this host shouldn't be accessed using a proxy

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        from fnmatch import fnmatch

        hostonly, port = splitport(host)

        def ip2num(ipAddr):
            parts = ipAddr.split('.')
            parts = map(int, parts)
            if len(parts) != 4:
                parts = (parts + [0, 0, 0, 0])[:4]
            return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3]

        proxy_settings = _get_proxy_settings()

        # Check for simple host names:
        if '.' not in host:
            if proxy_settings['exclude_simple']:
                return True

        hostIP = None

        for value in proxy_settings.get('exceptions', ()):
            # Items in the list are strings like these: *.local, 169.254/16
            if not value: continue

            m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value)
            if m is not None:
                if hostIP is None:
                    try:
                        hostIP = socket.gethostbyname(hostonly)
                        hostIP = ip2num(hostIP)
                    except socket.error:
                        continue

                base = ip2num(m.group(1))
                mask = int(m.group(2)[1:])
                mask = 32 - mask

                if (hostIP >> mask) == (base >> mask):
                    return True

            elif fnmatch(host, value):
                return True

        return False

    def getproxies_macosx_sysconf():
        """Return a dictionary of scheme -> proxy server URL mappings.

        This function uses the MacOSX framework SystemConfiguration
        to fetch the proxy information.
        """
        return _get_proxies()

    def proxy_bypass(host):
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_macosx_sysconf(host)

    def getproxies():
        return getproxies_environment() or getproxies_macosx_sysconf()
elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.
        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.
        """
        return getproxies_environment() or getproxies_registry()
    def proxy_bypass_registry(host):
        try:
            import _winreg
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        # now check if we match one of the registry values.
        for test in proxyOverride:
            if test == '<local>':
                if '.' not in rawHost:
                    return 1
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

    def proxy_bypass(host):
        """Return True if the given host should be accessed without a proxy.

        Uses settings gathered from the environment, if specified, or the
        registry otherwise.
        """
        if getproxies_environment():
            return proxy_bypass_environment(host)
        else:
            return proxy_bypass_registry(host)
else:
    # By default use environment variables
    getproxies = getproxies_environment
    proxy_bypass = proxy_bypass_environment
# Test and time quote() and unquote()
def test1():
    s = ''
    for i in range(256): s = s + chr(i)
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print round(t1 - t0, 3), 'sec'
def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)
def test(args=[]):
    if not args:
        args = [
            'file://localhost/etc/passwd',
            'ftp://ftp.gnu.org/pub/README',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    for url in args:
        print '-'*10, url, '-'*10
        fn, h = urlretrieve(url, None, reporthook)
        print fn
        if h:
            print '======'
            for k in h.keys(): print k + ':', h[k]
            print '======'
        fp = open(fn, 'rb')
        data = fp.read()
        fp.close()
        if '\r' in data:
            table = string.maketrans("", "")
            data = data.translate(table, "\r")
        print data
def main():
    import getopt
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),
# Run test program when run as a script
if __name__ == '__main__':
    main()