1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
30 from urlparse
import urljoin
as basejoin
32 __all__
= ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
34 "urlencode", "url2pathname", "pathname2url", "splittag",
35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37 "splitnport", "splitquery", "splitattr", "splitvalue",
40 __version__
= '1.17' # XXX This version is not always updated :-(
42 MAXFTPCACHE
= 10 # Trim the ftp cache beyond this size
44 # Helper for non-unix systems
46 from macurl2path
import url2pathname
, pathname2url
48 from nturl2path
import url2pathname
, pathname2url
49 elif os
.name
== 'riscos':
50 from rourl2path
import url2pathname
, pathname2url
52 def url2pathname(pathname
):
53 """OS-specific conversion from a relative URL of the 'file' scheme
54 to a file system path; not recommended for general use."""
55 return unquote(pathname
)
57 def pathname2url(pathname
):
58 """OS-specific conversion from a file system path to a relative URL
59 of the 'file' scheme; not recommended for general use."""
60 return quote(pathname
)
62 # This really consists of two pieces:
63 # (1) a class which handles opening of all sorts of URLs
64 # (plus assorted utilities etc.)
65 # (2) a set of functions for parsing URLs
66 # XXX Should these be separated out into different modules?
69 # Shortcut for basic usage
71 def urlopen(url
, data
=None, proxies
=None):
72 """urlopen(url [, data]) -> open file-like object"""
74 if proxies
is not None:
75 opener
= FancyURLopener(proxies
=proxies
)
77 opener
= FancyURLopener()
82 return opener
.open(url
)
84 return opener
.open(url
, data
)
85 def urlretrieve(url
, filename
=None, reporthook
=None, data
=None):
88 _urlopener
= FancyURLopener()
89 return _urlopener
.retrieve(url
, filename
, reporthook
, data
)
94 # exception raised when downloaded size does not match content-length
95 class ContentTooShortError(IOError):
96 def __init__(self
, message
, content
):
97 IOError.__init
__(self
, message
)
98 self
.content
= content
102 """Class to open URLs.
103 This is a class rather than just a subroutine because we may need
104 more than one set of global protocol-specific options.
105 Note -- this is a base class for those who don't want the
106 automatic handling of errors type 302 (relocated) and 401
107 (authorization needed)."""
111 version
= "Python-urllib/%s" % __version__
114 def __init__(self
, proxies
=None, **x509
):
116 proxies
= getproxies()
117 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
118 self
.proxies
= proxies
119 self
.key_file
= x509
.get('key_file')
120 self
.cert_file
= x509
.get('cert_file')
121 self
.addheaders
= [('User-Agent', self
.version
)]
122 self
.__tempfiles
= []
123 self
.__unlink
= os
.unlink
# See cleanup()
124 self
.tempcache
= None
125 # Undocumented feature: if you assign {} to tempcache,
126 # it is used to cache files retrieved with
127 # self.retrieve(). This is not enabled by default
128 # since it does not work for changing documents (and I
129 # haven't got the logic to check expiration headers
131 self
.ftpcache
= ftpcache
132 # Undocumented feature: you can use a different
133 # ftp cache by assigning to the .ftpcache member;
134 # in case you want logically independent URL openers
135 # XXX This is not threadsafe. Bah.
144 # This code sometimes runs when the rest of this module
145 # has already been deleted, so it can't use any globals
146 # or import anything.
148 for file in self
.__tempfiles
:
153 del self
.__tempfiles
[:]
155 self
.tempcache
.clear()
157 def addheader(self
, *args
):
158 """Add a header to be used by the HTTP interface only
159 e.g. u.addheader('Accept', 'sound/basic')"""
160 self
.addheaders
.append(args
)
163 def open(self
, fullurl
, data
=None):
164 """Use URLopener().open(file) instead of open(file, 'r')."""
165 fullurl
= unwrap(toBytes(fullurl
))
166 if self
.tempcache
and fullurl
in self
.tempcache
:
167 filename
, headers
= self
.tempcache
[fullurl
]
168 fp
= open(filename
, 'rb')
169 return addinfourl(fp
, headers
, fullurl
)
170 urltype
, url
= splittype(fullurl
)
173 if urltype
in self
.proxies
:
174 proxy
= self
.proxies
[urltype
]
175 urltype
, proxyhost
= splittype(proxy
)
176 host
, selector
= splithost(proxyhost
)
177 url
= (host
, fullurl
) # Signal special case to open_*()
180 name
= 'open_' + urltype
182 name
= name
.replace('-', '_')
183 if not hasattr(self
, name
):
185 return self
.open_unknown_proxy(proxy
, fullurl
, data
)
187 return self
.open_unknown(fullurl
, data
)
190 return getattr(self
, name
)(url
)
192 return getattr(self
, name
)(url
, data
)
193 except socket
.error
, msg
:
194 raise IOError, ('socket error', msg
), sys
.exc_info()[2]
196 def open_unknown(self
, fullurl
, data
=None):
197 """Overridable interface to open unknown URL type."""
198 type, url
= splittype(fullurl
)
199 raise IOError, ('url error', 'unknown url type', type)
201 def open_unknown_proxy(self
, proxy
, fullurl
, data
=None):
202 """Overridable interface to open unknown URL type."""
203 type, url
= splittype(fullurl
)
204 raise IOError, ('url error', 'invalid proxy for %s' % type, proxy
)
207 def retrieve(self
, url
, filename
=None, reporthook
=None, data
=None):
208 """retrieve(url) returns (filename, headers) for a local object
209 or (tempfilename, headers) for a remote object."""
210 url
= unwrap(toBytes(url
))
211 if self
.tempcache
and url
in self
.tempcache
:
212 return self
.tempcache
[url
]
213 type, url1
= splittype(url
)
214 if filename
is None and (not type or type == 'file'):
216 fp
= self
.open_local_file(url1
)
219 return url2pathname(splithost(url1
)[1]), hdrs
222 fp
= self
.open(url
, data
)
225 tfp
= open(filename
, 'wb')
228 garbage
, path
= splittype(url
)
229 garbage
, path
= splithost(path
or "")
230 path
, garbage
= splitquery(path
or "")
231 path
, garbage
= splitattr(path
or "")
232 suffix
= os
.path
.splitext(path
)[1]
233 (fd
, filename
) = tempfile
.mkstemp(suffix
)
234 self
.__tempfiles
.append(filename
)
235 tfp
= os
.fdopen(fd
, 'wb')
236 result
= filename
, headers
237 if self
.tempcache
is not None:
238 self
.tempcache
[url
] = result
244 if "content-length" in headers
:
245 size
= int(headers
["Content-Length"])
246 reporthook(blocknum
, bs
, size
)
255 reporthook(blocknum
, bs
, size
)
261 # raise exception if actual size does not match content-length header
262 if size
>= 0 and read
< size
:
263 raise ContentTooShortError("retrieval incomplete: got only %i out "
264 "of %i bytes" % (read
, size
), result
)
268 # Each method named open_<type> knows how to open that type of URL
270 def open_http(self
, url
, data
=None):
271 """Use HTTP protocol."""
275 if isinstance(url
, str):
276 host
, selector
= splithost(url
)
278 user_passwd
, host
= splituser(host
)
283 # check whether the proxy contains authorization information
284 proxy_passwd
, host
= splituser(host
)
285 # now we proceed with the url we want to obtain
286 urltype
, rest
= splittype(selector
)
289 if urltype
.lower() != 'http':
292 realhost
, rest
= splithost(rest
)
294 user_passwd
, realhost
= splituser(realhost
)
296 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
297 if proxy_bypass(realhost
):
300 #print "proxy via http:", host, selector
301 if not host
: raise IOError, ('http error', 'no host given')
305 proxy_auth
= base64
.b64encode(proxy_passwd
).strip()
311 auth
= base64
.b64encode(user_passwd
).strip()
314 h
= httplib
.HTTP(host
)
316 h
.putrequest('POST', selector
)
317 h
.putheader('Content-Type', 'application/x-www-form-urlencoded')
318 h
.putheader('Content-Length', '%d' % len(data
))
320 h
.putrequest('GET', selector
)
321 if proxy_auth
: h
.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth
)
322 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
323 if realhost
: h
.putheader('Host', realhost
)
324 for args
in self
.addheaders
: h
.putheader(*args
)
328 errcode
, errmsg
, headers
= h
.getreply()
332 # something went wrong with the HTTP status line
333 raise IOError, ('http protocol error', 0,
334 'got a bad status line', None)
336 return addinfourl(fp
, headers
, "http:" + url
)
339 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
341 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
, data
)
343 def http_error(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
344 """Handle http errors.
345 Derived class can override this, or provide specific handlers
346 named http_error_DDD where DDD is the 3-digit error code."""
347 # First check if there's a specific handler for this error
348 name
= 'http_error_%d' % errcode
349 if hasattr(self
, name
):
350 method
= getattr(self
, name
)
352 result
= method(url
, fp
, errcode
, errmsg
, headers
)
354 result
= method(url
, fp
, errcode
, errmsg
, headers
, data
)
355 if result
: return result
356 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
358 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
359 """Default error handler: close the connection and raise IOError."""
362 raise IOError, ('http error', errcode
, errmsg
, headers
)
364 if hasattr(socket
, "ssl"):
365 def open_https(self
, url
, data
=None):
366 """Use HTTPS protocol."""
370 if isinstance(url
, str):
371 host
, selector
= splithost(url
)
373 user_passwd
, host
= splituser(host
)
378 # here, we determine, whether the proxy contains authorization information
379 proxy_passwd
, host
= splituser(host
)
380 urltype
, rest
= splittype(selector
)
383 if urltype
.lower() != 'https':
386 realhost
, rest
= splithost(rest
)
388 user_passwd
, realhost
= splituser(realhost
)
390 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
391 #print "proxy via https:", host, selector
392 if not host
: raise IOError, ('https error', 'no host given')
395 proxy_auth
= base64
.b64encode(proxy_passwd
).strip()
400 auth
= base64
.b64encode(user_passwd
).strip()
403 h
= httplib
.HTTPS(host
, 0,
404 key_file
=self
.key_file
,
405 cert_file
=self
.cert_file
)
407 h
.putrequest('POST', selector
)
408 h
.putheader('Content-Type',
409 'application/x-www-form-urlencoded')
410 h
.putheader('Content-Length', '%d' % len(data
))
412 h
.putrequest('GET', selector
)
413 if proxy_auth
: h
.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth
)
414 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
415 if realhost
: h
.putheader('Host', realhost
)
416 for args
in self
.addheaders
: h
.putheader(*args
)
420 errcode
, errmsg
, headers
= h
.getreply()
424 # something went wrong with the HTTP status line
425 raise IOError, ('http protocol error', 0,
426 'got a bad status line', None)
428 return addinfourl(fp
, headers
, "https:" + url
)
431 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
433 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
,
436 def open_file(self
, url
):
437 """Use local file or FTP depending on form of URL."""
438 if not isinstance(url
, str):
439 raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
440 if url
[:2] == '//' and url
[2:3] != '/' and url
[2:12].lower() != 'localhost/':
441 return self
.open_ftp(url
)
443 return self
.open_local_file(url
)
445 def open_local_file(self
, url
):
446 """Use local file."""
447 import mimetypes
, mimetools
, email
.utils
449 from cStringIO
import StringIO
451 from StringIO
import StringIO
452 host
, file = splithost(url
)
453 localname
= url2pathname(file)
455 stats
= os
.stat(localname
)
457 raise IOError(e
.errno
, e
.strerror
, e
.filename
)
459 modified
= email
.utils
.formatdate(stats
.st_mtime
, usegmt
=True)
460 mtype
= mimetypes
.guess_type(url
)[0]
461 headers
= mimetools
.Message(StringIO(
462 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
463 (mtype
or 'text/plain', size
, modified
)))
467 urlfile
= 'file://' + file
468 return addinfourl(open(localname
, 'rb'),
470 host
, port
= splitport(host
)
472 and socket
.gethostbyname(host
) in (localhost(), thishost()):
475 urlfile
= 'file://' + file
476 return addinfourl(open(localname
, 'rb'),
478 raise IOError, ('local file error', 'not on local host')
480 def open_ftp(self
, url
):
481 """Use FTP protocol."""
482 if not isinstance(url
, str):
483 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
484 import mimetypes
, mimetools
486 from cStringIO
import StringIO
488 from StringIO
import StringIO
489 host
, path
= splithost(url
)
490 if not host
: raise IOError, ('ftp error', 'no host given')
491 host
, port
= splitport(host
)
492 user
, host
= splituser(host
)
493 if user
: user
, passwd
= splitpasswd(user
)
496 user
= unquote(user
or '')
497 passwd
= unquote(passwd
or '')
498 host
= socket
.gethostbyname(host
)
501 port
= ftplib
.FTP_PORT
504 path
, attrs
= splitattr(path
)
506 dirs
= path
.split('/')
507 dirs
, file = dirs
[:-1], dirs
[-1]
508 if dirs
and not dirs
[0]: dirs
= dirs
[1:]
509 if dirs
and not dirs
[0]: dirs
[0] = '/'
510 key
= user
, host
, port
, '/'.join(dirs
)
512 if len(self
.ftpcache
) > MAXFTPCACHE
:
513 # Prune the cache, rather arbitrarily
514 for k
in self
.ftpcache
.keys():
520 if not key
in self
.ftpcache
:
521 self
.ftpcache
[key
] = \
522 ftpwrapper(user
, passwd
, host
, port
, dirs
)
523 if not file: type = 'D'
526 attr
, value
= splitvalue(attr
)
527 if attr
.lower() == 'type' and \
528 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
530 (fp
, retrlen
) = self
.ftpcache
[key
].retrfile(file, type)
531 mtype
= mimetypes
.guess_type("ftp:" + url
)[0]
534 headers
+= "Content-Type: %s\n" % mtype
535 if retrlen
is not None and retrlen
>= 0:
536 headers
+= "Content-Length: %d\n" % retrlen
537 headers
= mimetools
.Message(StringIO(headers
))
538 return addinfourl(fp
, headers
, "ftp:" + url
)
539 except ftperrors(), msg
:
540 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
542 def open_data(self
, url
, data
=None):
543 """Use "data" URL."""
544 if not isinstance(url
, str):
545 raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
548 # syntax of data URLs:
549 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
550 # mediatype := [ type "/" subtype ] *( ";" parameter )
552 # parameter := attribute "=" value
555 from cStringIO
import StringIO
557 from StringIO
import StringIO
559 [type, data
] = url
.split(',', 1)
561 raise IOError, ('data error', 'bad data URL')
563 type = 'text/plain;charset=US-ASCII'
564 semi
= type.rfind(';')
565 if semi
>= 0 and '=' not in type[semi
:]:
566 encoding
= type[semi
+1:]
571 msg
.append('Date: %s'%time
.strftime('%a, %d %b %Y %T GMT',
572 time
.gmtime(time
.time())))
573 msg
.append('Content-type: %s' % type)
574 if encoding
== 'base64':
576 data
= base64
.decodestring(data
)
579 msg
.append('Content-Length: %d' % len(data
))
584 headers
= mimetools
.Message(f
, 0)
585 #f.fileno = None # needed for addinfourl
586 return addinfourl(f
, headers
, url
)
589 class FancyURLopener(URLopener
):
590 """Derived class with handlers for errors we can handle (perhaps)."""
592 def __init__(self
, *args
, **kwargs
):
593 URLopener
.__init
__(self
, *args
, **kwargs
)
598 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
599 """Default error handling -- don't raise an exception."""
600 return addinfourl(fp
, headers
, "http:" + url
)
602 def http_error_302(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
603 """Error 302 -- relocated (temporarily)."""
605 if self
.maxtries
and self
.tries
>= self
.maxtries
:
606 if hasattr(self
, "http_error_500"):
607 meth
= self
.http_error_500
609 meth
= self
.http_error_default
611 return meth(url
, fp
, 500,
612 "Internal Server Error: Redirect Recursion", headers
)
613 result
= self
.redirect_internal(url
, fp
, errcode
, errmsg
, headers
,
618 def redirect_internal(self
, url
, fp
, errcode
, errmsg
, headers
, data
):
619 if 'location' in headers
:
620 newurl
= headers
['location']
621 elif 'uri' in headers
:
622 newurl
= headers
['uri']
627 # In case the server sent a relative URL, join with original:
628 newurl
= basejoin(self
.type + ":" + url
, newurl
)
629 return self
.open(newurl
)
631 def http_error_301(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
632 """Error 301 -- also relocated (permanently)."""
633 return self
.http_error_302(url
, fp
, errcode
, errmsg
, headers
, data
)
635 def http_error_303(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
636 """Error 303 -- also relocated (essentially identical to 302)."""
637 return self
.http_error_302(url
, fp
, errcode
, errmsg
, headers
, data
)
639 def http_error_307(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
640 """Error 307 -- relocated, but turn POST into error."""
642 return self
.http_error_302(url
, fp
, errcode
, errmsg
, headers
, data
)
644 return self
.http_error_default(url
, fp
, errcode
, errmsg
, headers
)
646 def http_error_401(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
647 """Error 401 -- authentication required.
648 This function supports Basic authentication only."""
649 if not 'www-authenticate' in headers
:
650 URLopener
.http_error_default(self
, url
, fp
,
651 errcode
, errmsg
, headers
)
652 stuff
= headers
['www-authenticate']
654 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
656 URLopener
.http_error_default(self
, url
, fp
,
657 errcode
, errmsg
, headers
)
658 scheme
, realm
= match
.groups()
659 if scheme
.lower() != 'basic':
660 URLopener
.http_error_default(self
, url
, fp
,
661 errcode
, errmsg
, headers
)
662 name
= 'retry_' + self
.type + '_basic_auth'
664 return getattr(self
,name
)(url
, realm
)
666 return getattr(self
,name
)(url
, realm
, data
)
668 def http_error_407(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
669 """Error 407 -- proxy authentication required.
670 This function supports Basic authentication only."""
671 if not 'proxy-authenticate' in headers
:
672 URLopener
.http_error_default(self
, url
, fp
,
673 errcode
, errmsg
, headers
)
674 stuff
= headers
['proxy-authenticate']
676 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
678 URLopener
.http_error_default(self
, url
, fp
,
679 errcode
, errmsg
, headers
)
680 scheme
, realm
= match
.groups()
681 if scheme
.lower() != 'basic':
682 URLopener
.http_error_default(self
, url
, fp
,
683 errcode
, errmsg
, headers
)
684 name
= 'retry_proxy_' + self
.type + '_basic_auth'
686 return getattr(self
,name
)(url
, realm
)
688 return getattr(self
,name
)(url
, realm
, data
)
690 def retry_proxy_http_basic_auth(self
, url
, realm
, data
=None):
691 host
, selector
= splithost(url
)
692 newurl
= 'http://' + host
+ selector
693 proxy
= self
.proxies
['http']
694 urltype
, proxyhost
= splittype(proxy
)
695 proxyhost
, proxyselector
= splithost(proxyhost
)
696 i
= proxyhost
.find('@') + 1
697 proxyhost
= proxyhost
[i
:]
698 user
, passwd
= self
.get_user_passwd(proxyhost
, realm
, i
)
699 if not (user
or passwd
): return None
700 proxyhost
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + proxyhost
701 self
.proxies
['http'] = 'http://' + proxyhost
+ proxyselector
703 return self
.open(newurl
)
705 return self
.open(newurl
, data
)
707 def retry_proxy_https_basic_auth(self
, url
, realm
, data
=None):
708 host
, selector
= splithost(url
)
709 newurl
= 'https://' + host
+ selector
710 proxy
= self
.proxies
['https']
711 urltype
, proxyhost
= splittype(proxy
)
712 proxyhost
, proxyselector
= splithost(proxyhost
)
713 i
= proxyhost
.find('@') + 1
714 proxyhost
= proxyhost
[i
:]
715 user
, passwd
= self
.get_user_passwd(proxyhost
, realm
, i
)
716 if not (user
or passwd
): return None
717 proxyhost
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + proxyhost
718 self
.proxies
['https'] = 'https://' + proxyhost
+ proxyselector
720 return self
.open(newurl
)
722 return self
.open(newurl
, data
)
724 def retry_http_basic_auth(self
, url
, realm
, data
=None):
725 host
, selector
= splithost(url
)
726 i
= host
.find('@') + 1
728 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
729 if not (user
or passwd
): return None
730 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
731 newurl
= 'http://' + host
+ selector
733 return self
.open(newurl
)
735 return self
.open(newurl
, data
)
737 def retry_https_basic_auth(self
, url
, realm
, data
=None):
738 host
, selector
= splithost(url
)
739 i
= host
.find('@') + 1
741 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
742 if not (user
or passwd
): return None
743 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
744 newurl
= 'https://' + host
+ selector
746 return self
.open(newurl
)
748 return self
.open(newurl
, data
)
750 def get_user_passwd(self
, host
, realm
, clear_cache
= 0):
751 key
= realm
+ '@' + host
.lower()
752 if key
in self
.auth_cache
:
754 del self
.auth_cache
[key
]
756 return self
.auth_cache
[key
]
757 user
, passwd
= self
.prompt_user_passwd(host
, realm
)
758 if user
or passwd
: self
.auth_cache
[key
] = (user
, passwd
)
761 def prompt_user_passwd(self
, host
, realm
):
762 """Override this in a GUI environment!"""
765 user
= raw_input("Enter username for %s at %s: " % (realm
,
767 passwd
= getpass
.getpass("Enter password for %s in %s at %s: " %
770 except KeyboardInterrupt:
779 """Return the IP address of the magic hostname 'localhost'."""
781 if _localhost
is None:
782 _localhost
= socket
.gethostbyname('localhost')
787 """Return the IP address of the current host."""
789 if _thishost
is None:
790 _thishost
= socket
.gethostbyname(socket
.gethostname())
795 """Return the set of errors raised by the FTP class."""
797 if _ftperrors
is None:
799 _ftperrors
= ftplib
.all_errors
804 """Return an empty mimetools.Message object."""
806 if _noheaders
is None:
809 from cStringIO
import StringIO
811 from StringIO
import StringIO
812 _noheaders
= mimetools
.Message(StringIO(), 0)
813 _noheaders
.fp
.close() # Recycle file descriptor
820 """Class used by open_ftp() for cache of open FTP connections."""
822 def __init__(self
, user
, passwd
, host
, port
, dirs
):
833 self
.ftp
= ftplib
.FTP()
834 self
.ftp
.connect(self
.host
, self
.port
)
835 self
.ftp
.login(self
.user
, self
.passwd
)
836 for dir in self
.dirs
:
839 def retrfile(self
, file, type):
842 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
843 else: cmd
= 'TYPE ' + type; isdir
= 0
845 self
.ftp
.voidcmd(cmd
)
846 except ftplib
.all_errors
:
848 self
.ftp
.voidcmd(cmd
)
850 if file and not isdir
:
851 # Try to retrieve as a file
854 conn
= self
.ftp
.ntransfercmd(cmd
)
855 except ftplib
.error_perm
, reason
:
856 if str(reason
)[:3] != '550':
857 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
859 # Set transfer mode to ASCII!
860 self
.ftp
.voidcmd('TYPE A')
861 # Try a directory listing
862 if file: cmd
= 'LIST ' + file
864 conn
= self
.ftp
.ntransfercmd(cmd
)
866 # Pass back both a suitably decorated object and a retrieval length
867 return (addclosehook(conn
[0].makefile('rb'),
868 self
.endtransfer
), conn
[1])
869 def endtransfer(self
):
886 """Base class for addinfo and addclosehook."""
888 def __init__(self
, fp
):
890 self
.read
= self
.fp
.read
891 self
.readline
= self
.fp
.readline
892 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
893 if hasattr(self
.fp
, "fileno"):
894 self
.fileno
= self
.fp
.fileno
896 self
.fileno
= lambda: None
897 if hasattr(self
.fp
, "__iter__"):
898 self
.__iter
__ = self
.fp
.__iter
__
899 if hasattr(self
.fp
, "next"):
900 self
.next
= self
.fp
.next
903 return '<%s at %r whose fp = %r>' % (self
.__class
__.__name
__,
909 self
.readlines
= None
911 if self
.fp
: self
.fp
.close()
914 class addclosehook(addbase
):
915 """Class to add a close hook to an open file."""
917 def __init__(self
, fp
, closehook
, *hookargs
):
918 addbase
.__init
__(self
, fp
)
919 self
.closehook
= closehook
920 self
.hookargs
= hookargs
925 self
.closehook(*self
.hookargs
)
926 self
.closehook
= None
929 class addinfo(addbase
):
930 """class to add an info() method to an open file."""
932 def __init__(self
, fp
, headers
):
933 addbase
.__init
__(self
, fp
)
934 self
.headers
= headers
939 class addinfourl(addbase
):
940 """class to add info() and geturl() methods to an open file."""
942 def __init__(self
, fp
, headers
, url
):
943 addbase
.__init
__(self
, fp
)
944 self
.headers
= headers
954 # Utilities to parse URLs (most of these return None for missing parts):
955 # unwrap('<URL:type://host/path>') --> 'type://host/path'
956 # splittype('type:opaquestring') --> 'type', 'opaquestring'
957 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
958 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
959 # splitpasswd('user:passwd') -> 'user', 'passwd'
960 # splitport('host:port') --> 'host', 'port'
961 # splitquery('/path?query') --> '/path', 'query'
962 # splittag('/path#tag') --> '/path', 'tag'
963 # splitattr('/path;attr1=value1;attr2=value2;...') ->
964 # '/path', ['attr1=value1', 'attr2=value2', ...]
965 # splitvalue('attr=value') --> 'attr', 'value'
966 # unquote('abc%20def') -> 'abc def'
967 # quote('abc def') -> 'abc%20def')
976 return isinstance(x
, unicode)
979 """toBytes(u"URL") --> 'URL'."""
980 # Most URL schemes require ASCII. If that changes, the conversion
984 url
= url
.encode("ASCII")
986 raise UnicodeError("URL " + repr(url
) +
987 " contains non-ASCII characters")
991 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
993 if url
[:1] == '<' and url
[-1:] == '>':
994 url
= url
[1:-1].strip()
995 if url
[:4] == 'URL:': url
= url
[4:].strip()
1000 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1002 if _typeprog
is None:
1004 _typeprog
= re
.compile('^([^/:]+):')
1006 match
= _typeprog
.match(url
)
1008 scheme
= match
.group(1)
1009 return scheme
.lower(), url
[len(scheme
) + 1:]
1014 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1016 if _hostprog
is None:
1018 _hostprog
= re
.compile('^//([^/?]*)(.*)$')
1020 match
= _hostprog
.match(url
)
1021 if match
: return match
.group(1, 2)
1025 def splituser(host
):
1026 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1028 if _userprog
is None:
1030 _userprog
= re
.compile('^(.*)@(.*)$')
1032 match
= _userprog
.match(host
)
1033 if match
: return map(unquote
, match
.group(1, 2))
1037 def splitpasswd(user
):
1038 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1040 if _passwdprog
is None:
1042 _passwdprog
= re
.compile('^([^:]*):(.*)$')
1044 match
= _passwdprog
.match(user
)
1045 if match
: return match
.group(1, 2)
1048 # splittag('/path#tag') --> '/path', 'tag'
1050 def splitport(host
):
1051 """splitport('host:port') --> 'host', 'port'."""
1053 if _portprog
is None:
1055 _portprog
= re
.compile('^(.*):([0-9]+)$')
1057 match
= _portprog
.match(host
)
1058 if match
: return match
.group(1, 2)
1062 def splitnport(host
, defport
=-1):
1063 """Split host and port, returning numeric port.
1064 Return given default port if no ':' found; defaults to -1.
1065 Return numerical port if a valid number are found after ':'.
1066 Return None if ':' but not a valid number."""
1068 if _nportprog
is None:
1070 _nportprog
= re
.compile('^(.*):(.*)$')
1072 match
= _nportprog
.match(host
)
1074 host
, port
= match
.group(1, 2)
1076 if not port
: raise ValueError, "no digits"
1081 return host
, defport
1084 def splitquery(url
):
1085 """splitquery('/path?query') --> '/path', 'query'."""
1087 if _queryprog
is None:
1089 _queryprog
= re
.compile('^(.*)\?([^?]*)$')
1091 match
= _queryprog
.match(url
)
1092 if match
: return match
.group(1, 2)
1097 """splittag('/path#tag') --> '/path', 'tag'."""
1099 if _tagprog
is None:
1101 _tagprog
= re
.compile('^(.*)#([^#]*)$')
1103 match
= _tagprog
.match(url
)
1104 if match
: return match
.group(1, 2)
1108 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1109 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1110 words
= url
.split(';')
1111 return words
[0], words
[1:]
1114 def splitvalue(attr
):
1115 """splitvalue('attr=value') --> 'attr', 'value'."""
1117 if _valueprog
is None:
1119 _valueprog
= re
.compile('^([^=]*)=(.*)$')
1121 match
= _valueprog
.match(attr
)
1122 if match
: return match
.group(1, 2)
1125 _hextochr
= dict(('%02x' % i
, chr(i
)) for i
in range(256))
1126 _hextochr
.update(('%02X' % i
, chr(i
)) for i
in range(256))
1129 """unquote('abc%20def') -> 'abc def'."""
1131 for i
in xrange(1, len(res
)):
1134 res
[i
] = _hextochr
[item
[:2]] + item
[2:]
1137 except UnicodeDecodeError:
1138 res
[i
] = unichr(int(item
[:2], 16)) + item
[2:]
1141 def unquote_plus(s
):
1142 """unquote('%7e/abc+def') -> '~/abc def'"""
1143 s
= s
.replace('+', ' ')
1146 always_safe
= ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1147 'abcdefghijklmnopqrstuvwxyz'
1151 def quote(s
, safe
= '/'):
1152 """quote('abc def') -> 'abc%20def'
1154 Each part of a URL, e.g. the path info, the query, etc., has a
1155 different set of reserved characters that must be quoted.
1157 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1158 the following reserved characters.
1160 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1163 Each of these characters is reserved in some component of a URL,
1164 but not necessarily in all of them.
1166 By default, the quote function is intended for quoting the path
1167 section of a URL. Thus, it will not encode '/'. This character
1168 is reserved, but in typical usage the quote function is being
1169 called on a path where the existing slash characters are used as
1170 reserved characters.
1172 cachekey
= (safe
, always_safe
)
1174 safe_map
= _safemaps
[cachekey
]
1178 for i
in range(256):
1180 safe_map
[c
] = (c
in safe
) and c
or ('%%%02X' % i
)
1181 _safemaps
[cachekey
] = safe_map
1182 res
= map(safe_map
.__getitem
__, s
)
1185 def quote_plus(s
, safe
= ''):
1186 """Quote the query fragment of a URL; replacing ' ' with '+'"""
1188 s
= quote(s
, safe
+ ' ')
1189 return s
.replace(' ', '+')
1190 return quote(s
, safe
)
def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        # mapping objects: flatten to a sequence of (key, value) pairs
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            # re-raise with the original traceback so the caller can see
            # where the bad argument actually failed
            ty, va, tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior: every value is stringified whole
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif _is_unicode(v):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence: fall back to stringifying it whole
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence, one parameter per element
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    found = {}
    for var, url in os.environ.items():
        var = var.lower()
        # pick up '<scheme>_proxy' entries, ignoring empty values
        if url and var[-6:] == '_proxy':
            found[var[:-6]] = url
    return found
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            # Internet Config support unavailable: report no proxies
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        # no bypass information on the mac: never bypass the proxy
        return 0

    def getproxies():
        # environment settings take precedence over Internet Config
        return getproxies_environment() or getproxies_internetconfig()
elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            # no scheme given: assume the protocol name
                            # doubles as the URL scheme
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()
    def proxy_bypass(host):
        """Return 1 if 'host' matches the registry ProxyOverride list, else 0."""
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        rawHost, port = splitport(host)
        host = [rawHost]
        try:
            addr = socket.gethostbyname(rawHost)
            if addr != rawHost:
                host.append(addr)
        except socket.error:
            pass
        try:
            fqdn = socket.getfqdn(rawHost)
            if fqdn != rawHost:
                host.append(fqdn)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            # translate the shell-style glob into a regular expression
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0
else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        # no platform-specific bypass information available
        return 0
# Test and time quote() and unquote()
def test1():
    # round-trip every byte value through quote()/unquote() and time it
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print repr(s)
    print repr(qs)
    print repr(uqs)
    print round(t1 - t0, 3), 'sec'
1447 def reporthook(blocknum
, blocksize
, totalsize
):
1448 # Report during remote transfers
1449 print "Block number: %d, Block size: %d, Total size: %d" % (
1450 blocknum
, blocksize
, totalsize
)
def test(args=[]):
    # Self-test: fetch each URL in args (or a default list spanning the
    # file, ftp and http protocols), printing headers and body.
    if not args:
        # note: args is rebound here, so the mutable default is never mutated
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.gnu.org/pub/README',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                # strip carriage returns for display
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
        print '-'*40
    finally:
        # always remove the temporary files created by urlretrieve
        urlcleanup()
def main():
    """Command-line driver: -t runs the self-test, otherwise print URL contents."""
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        # -t given more than once also runs the quote/unquote timing test
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),
# Run test program when run as a script
if __name__ == '__main__':
    main()