1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
16 The object returned by URLopener().open(file) will differ per
17 protocol. All you know is that it has methods read(), readline(),
18 readlines(), fileno(), close() and info(). The read*(), fileno()
19 and close() methods work like those of open files.
20 The info() method returns a mimetools.Message object which can be
21 used to query various info about the object, if available.
22 (mimetools.Message objects are queried with the getheader() method.)
30 from urlparse
import urljoin
as basejoin
32 __all__
= ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
33 "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
34 "urlencode", "url2pathname", "pathname2url", "splittag",
35 "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
36 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
37 "splitnport", "splitquery", "splitattr", "splitvalue",
40 __version__
= '1.17' # XXX This version is not always updated :-(
42 MAXFTPCACHE
= 10 # Trim the ftp cache beyond this size
44 # Helper for non-unix systems
46 from macurl2path
import url2pathname
, pathname2url
48 from nturl2path
import url2pathname
, pathname2url
49 elif os
.name
== 'riscos':
50 from rourl2path
import url2pathname
, pathname2url
def url2pathname(pathname):
    """OS-specific conversion from a relative URL of the 'file' scheme
    to a file system path; not recommended for general use."""
    # Generic (POSIX-style) fallback: the path is simply the
    # percent-decoded URL path.
    decoded = unquote(pathname)
    return decoded
def pathname2url(pathname):
    """OS-specific conversion from a file system path to a relative URL
    of the 'file' scheme; not recommended for general use."""
    # Generic (POSIX-style) fallback: percent-encode the path as-is.
    encoded = quote(pathname)
    return encoded
62 # This really consists of two pieces:
63 # (1) a class which handles opening of all sorts of URLs
64 # (plus assorted utilities etc.)
65 # (2) a set of functions for parsing URLs
66 # XXX Should these be separated out into different modules?
69 # Shortcut for basic usage
71 def urlopen(url
, data
=None, proxies
=None):
72 """urlopen(url [, data]) -> open file-like object"""
74 if proxies
is not None:
75 opener
= FancyURLopener(proxies
=proxies
)
77 opener
= FancyURLopener()
82 return opener
.open(url
)
84 return opener
.open(url
, data
)
85 def urlretrieve(url
, filename
=None, reporthook
=None, data
=None):
88 _urlopener
= FancyURLopener()
89 return _urlopener
.retrieve(url
, filename
, reporthook
, data
)
102 # exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    """Raised when a download delivers fewer bytes than the
    Content-Length header promised."""

    def __init__(self, message, content):
        # Keep the partially downloaded payload so callers can inspect
        # or salvage what did arrive.
        super(ContentTooShortError, self).__init__(message)
        self.content = content
110 """Class to open URLs.
111 This is a class rather than just a subroutine because we may need
112 more than one set of global protocol-specific options.
113 Note -- this is a base class for those who don't want the
114 automatic handling of errors type 302 (relocated) and 401
115 (authorization needed)."""
119 version
= "Python-urllib/%s" % __version__
122 def __init__(self
, proxies
=None, **x509
):
124 proxies
= getproxies()
125 assert hasattr(proxies
, 'has_key'), "proxies must be a mapping"
126 self
.proxies
= proxies
127 self
.key_file
= x509
.get('key_file')
128 self
.cert_file
= x509
.get('cert_file')
129 self
.addheaders
= [('User-Agent', self
.version
)]
130 self
.__tempfiles
= []
131 self
.__unlink
= os
.unlink
# See cleanup()
132 self
.tempcache
= None
133 # Undocumented feature: if you assign {} to tempcache,
134 # it is used to cache files retrieved with
135 # self.retrieve(). This is not enabled by default
136 # since it does not work for changing documents (and I
137 # haven't got the logic to check expiration headers
139 self
.ftpcache
= ftpcache
140 # Undocumented feature: you can use a different
141 # ftp cache by assigning to the .ftpcache member;
142 # in case you want logically independent URL openers
143 # XXX This is not threadsafe. Bah.
152 # This code sometimes runs when the rest of this module
153 # has already been deleted, so it can't use any globals
154 # or import anything.
156 for file in self
.__tempfiles
:
161 del self
.__tempfiles
[:]
163 self
.tempcache
.clear()
def addheader(self, *args):
    """Add a header to be used by the HTTP interface only
    e.g. u.addheader('Accept', 'sound/basic')"""
    # The header is stored as the argument tuple exactly as given.
    header_list = self.addheaders
    header_list.append(args)
171 def open(self
, fullurl
, data
=None):
172 """Use URLopener().open(file) instead of open(file, 'r')."""
173 fullurl
= unwrap(toBytes(fullurl
))
174 if self
.tempcache
and fullurl
in self
.tempcache
:
175 filename
, headers
= self
.tempcache
[fullurl
]
176 fp
= open(filename
, 'rb')
177 return addinfourl(fp
, headers
, fullurl
)
178 urltype
, url
= splittype(fullurl
)
181 if urltype
in self
.proxies
:
182 proxy
= self
.proxies
[urltype
]
183 urltype
, proxyhost
= splittype(proxy
)
184 host
, selector
= splithost(proxyhost
)
185 url
= (host
, fullurl
) # Signal special case to open_*()
188 name
= 'open_' + urltype
190 name
= name
.replace('-', '_')
191 if not hasattr(self
, name
):
193 return self
.open_unknown_proxy(proxy
, fullurl
, data
)
195 return self
.open_unknown(fullurl
, data
)
198 return getattr(self
, name
)(url
)
200 return getattr(self
, name
)(url
, data
)
201 except socket
.error
, msg
:
202 raise IOError, ('socket error', msg
), sys
.exc_info()[2]
def open_unknown(self, fullurl, data=None):
    """Overridable interface to open unknown URL type."""
    # Report the scheme for which no open_<type> handler was found.
    scheme, _rest = splittype(fullurl)
    raise IOError(('url error', 'unknown url type', scheme))
def open_unknown_proxy(self, proxy, fullurl, data=None):
    """Overridable interface to open unknown URL type."""
    # The configured proxy does not know how to handle this scheme.
    scheme, _rest = splittype(fullurl)
    raise IOError(('url error', 'invalid proxy for %s' % scheme, proxy))
215 def retrieve(self
, url
, filename
=None, reporthook
=None, data
=None):
216 """retrieve(url) returns (filename, headers) for a local object
217 or (tempfilename, headers) for a remote object."""
218 url
= unwrap(toBytes(url
))
219 if self
.tempcache
and url
in self
.tempcache
:
220 return self
.tempcache
[url
]
221 type, url1
= splittype(url
)
222 if filename
is None and (not type or type == 'file'):
224 fp
= self
.open_local_file(url1
)
227 return url2pathname(splithost(url1
)[1]), hdrs
230 fp
= self
.open(url
, data
)
233 tfp
= open(filename
, 'wb')
236 garbage
, path
= splittype(url
)
237 garbage
, path
= splithost(path
or "")
238 path
, garbage
= splitquery(path
or "")
239 path
, garbage
= splitattr(path
or "")
240 suffix
= os
.path
.splitext(path
)[1]
241 (fd
, filename
) = tempfile
.mkstemp(suffix
)
242 self
.__tempfiles
.append(filename
)
243 tfp
= os
.fdopen(fd
, 'wb')
244 result
= filename
, headers
245 if self
.tempcache
is not None:
246 self
.tempcache
[url
] = result
252 if "content-length" in headers
:
253 size
= int(headers
["Content-Length"])
254 reporthook(blocknum
, bs
, size
)
263 reporthook(blocknum
, bs
, size
)
269 # raise exception if actual size does not match content-length header
270 if size
>= 0 and read
< size
:
271 raise ContentTooShortError("retrieval incomplete: got only %i out "
272 "of %i bytes" % (read
, size
), result
)
276 # Each method named open_<type> knows how to open that type of URL
278 def open_http(self
, url
, data
=None):
279 """Use HTTP protocol."""
283 if isinstance(url
, str):
284 host
, selector
= splithost(url
)
286 user_passwd
, host
= splituser(host
)
291 # check whether the proxy contains authorization information
292 proxy_passwd
, host
= splituser(host
)
293 # now we proceed with the url we want to obtain
294 urltype
, rest
= splittype(selector
)
297 if urltype
.lower() != 'http':
300 realhost
, rest
= splithost(rest
)
302 user_passwd
, realhost
= splituser(realhost
)
304 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
305 if proxy_bypass(realhost
):
308 #print "proxy via http:", host, selector
309 if not host
: raise IOError, ('http error', 'no host given')
313 proxy_auth
= base64
.b64encode(proxy_passwd
).strip()
319 auth
= base64
.b64encode(user_passwd
).strip()
322 h
= httplib
.HTTP(host
)
324 h
.putrequest('POST', selector
)
325 h
.putheader('Content-Type', 'application/x-www-form-urlencoded')
326 h
.putheader('Content-Length', '%d' % len(data
))
328 h
.putrequest('GET', selector
)
329 if proxy_auth
: h
.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth
)
330 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
331 if realhost
: h
.putheader('Host', realhost
)
332 for args
in self
.addheaders
: h
.putheader(*args
)
336 errcode
, errmsg
, headers
= h
.getreply()
340 # something went wrong with the HTTP status line
341 raise IOError, ('http protocol error', 0,
342 'got a bad status line', None)
343 # According to RFC 2616, "2xx" code indicates that the client's
344 # request was successfully received, understood, and accepted.
345 if not (200 <= errcode
< 300):
346 return addinfourl(fp
, headers
, "http:" + url
)
349 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
351 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
, data
)
def http_error(self, url, fp, errcode, errmsg, headers, data=None):
    """Handle http errors.

    Derived class can override this, or provide specific handlers
    named http_error_DDD where DDD is the 3-digit error code."""
    # Dispatch to a code-specific handler when one exists; fall back to
    # http_error_default when there is none or it declines (returns falsy).
    handler_name = 'http_error_%d' % errcode
    if hasattr(self, handler_name):
        handler = getattr(self, handler_name)
        if data is None:
            outcome = handler(url, fp, errcode, errmsg, headers)
        else:
            outcome = handler(url, fp, errcode, errmsg, headers, data)
        if outcome:
            return outcome
    return self.http_error_default(url, fp, errcode, errmsg, headers)
368 def http_error_default(self
, url
, fp
, errcode
, errmsg
, headers
):
369 """Default error handler: close the connection and raise IOError."""
372 raise IOError, ('http error', errcode
, errmsg
, headers
)
375 def open_https(self
, url
, data
=None):
376 """Use HTTPS protocol."""
381 if isinstance(url
, str):
382 host
, selector
= splithost(url
)
384 user_passwd
, host
= splituser(host
)
389 # here, we determine, whether the proxy contains authorization information
390 proxy_passwd
, host
= splituser(host
)
391 urltype
, rest
= splittype(selector
)
394 if urltype
.lower() != 'https':
397 realhost
, rest
= splithost(rest
)
399 user_passwd
, realhost
= splituser(realhost
)
401 selector
= "%s://%s%s" % (urltype
, realhost
, rest
)
402 #print "proxy via https:", host, selector
403 if not host
: raise IOError, ('https error', 'no host given')
406 proxy_auth
= base64
.b64encode(proxy_passwd
).strip()
411 auth
= base64
.b64encode(user_passwd
).strip()
414 h
= httplib
.HTTPS(host
, 0,
415 key_file
=self
.key_file
,
416 cert_file
=self
.cert_file
)
418 h
.putrequest('POST', selector
)
419 h
.putheader('Content-Type',
420 'application/x-www-form-urlencoded')
421 h
.putheader('Content-Length', '%d' % len(data
))
423 h
.putrequest('GET', selector
)
424 if proxy_auth
: h
.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth
)
425 if auth
: h
.putheader('Authorization', 'Basic %s' % auth
)
426 if realhost
: h
.putheader('Host', realhost
)
427 for args
in self
.addheaders
: h
.putheader(*args
)
431 errcode
, errmsg
, headers
= h
.getreply()
435 # something went wrong with the HTTP status line
436 raise IOError, ('http protocol error', 0,
437 'got a bad status line', None)
438 # According to RFC 2616, "2xx" code indicates that the client's
439 # request was successfully received, understood, and accepted.
440 if not (200 <= errcode
< 300):
441 return addinfourl(fp
, headers
, "https:" + url
)
444 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
)
446 return self
.http_error(url
, fp
, errcode
, errmsg
, headers
,
def open_file(self, url):
    """Use local file or FTP depending on form of URL."""
    if not isinstance(url, str):
        raise IOError(('file error', 'proxy support for file protocol currently not implemented'))
    # '//host/...' with a non-empty host other than 'localhost' is
    # treated as an FTP reference; everything else is a local file.
    if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
        return self.open_ftp(url)
    return self.open_local_file(url)
458 def open_local_file(self
, url
):
459 """Use local file."""
460 import mimetypes
, mimetools
, email
.utils
462 from cStringIO
import StringIO
464 from StringIO
import StringIO
465 host
, file = splithost(url
)
466 localname
= url2pathname(file)
468 stats
= os
.stat(localname
)
470 raise IOError(e
.errno
, e
.strerror
, e
.filename
)
472 modified
= email
.utils
.formatdate(stats
.st_mtime
, usegmt
=True)
473 mtype
= mimetypes
.guess_type(url
)[0]
474 headers
= mimetools
.Message(StringIO(
475 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
476 (mtype
or 'text/plain', size
, modified
)))
480 urlfile
= 'file://' + file
481 return addinfourl(open(localname
, 'rb'),
483 host
, port
= splitport(host
)
485 and socket
.gethostbyname(host
) in (localhost(), thishost()):
488 urlfile
= 'file://' + file
489 return addinfourl(open(localname
, 'rb'),
491 raise IOError, ('local file error', 'not on local host')
493 def open_ftp(self
, url
):
494 """Use FTP protocol."""
495 if not isinstance(url
, str):
496 raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
497 import mimetypes
, mimetools
499 from cStringIO
import StringIO
501 from StringIO
import StringIO
502 host
, path
= splithost(url
)
503 if not host
: raise IOError, ('ftp error', 'no host given')
504 host
, port
= splitport(host
)
505 user
, host
= splituser(host
)
506 if user
: user
, passwd
= splitpasswd(user
)
509 user
= unquote(user
or '')
510 passwd
= unquote(passwd
or '')
511 host
= socket
.gethostbyname(host
)
514 port
= ftplib
.FTP_PORT
517 path
, attrs
= splitattr(path
)
519 dirs
= path
.split('/')
520 dirs
, file = dirs
[:-1], dirs
[-1]
521 if dirs
and not dirs
[0]: dirs
= dirs
[1:]
522 if dirs
and not dirs
[0]: dirs
[0] = '/'
523 key
= user
, host
, port
, '/'.join(dirs
)
525 if len(self
.ftpcache
) > MAXFTPCACHE
:
526 # Prune the cache, rather arbitrarily
527 for k
in self
.ftpcache
.keys():
533 if not key
in self
.ftpcache
:
534 self
.ftpcache
[key
] = \
535 ftpwrapper(user
, passwd
, host
, port
, dirs
)
536 if not file: type = 'D'
539 attr
, value
= splitvalue(attr
)
540 if attr
.lower() == 'type' and \
541 value
in ('a', 'A', 'i', 'I', 'd', 'D'):
543 (fp
, retrlen
) = self
.ftpcache
[key
].retrfile(file, type)
544 mtype
= mimetypes
.guess_type("ftp:" + url
)[0]
547 headers
+= "Content-Type: %s\n" % mtype
548 if retrlen
is not None and retrlen
>= 0:
549 headers
+= "Content-Length: %d\n" % retrlen
550 headers
= mimetools
.Message(StringIO(headers
))
551 return addinfourl(fp
, headers
, "ftp:" + url
)
552 except ftperrors(), msg
:
553 raise IOError, ('ftp error', msg
), sys
.exc_info()[2]
555 def open_data(self
, url
, data
=None):
556 """Use "data" URL."""
557 if not isinstance(url
, str):
558 raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
561 # syntax of data URLs:
562 # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data
563 # mediatype := [ type "/" subtype ] *( ";" parameter )
565 # parameter := attribute "=" value
568 from cStringIO
import StringIO
570 from StringIO
import StringIO
572 [type, data
] = url
.split(',', 1)
574 raise IOError, ('data error', 'bad data URL')
576 type = 'text/plain;charset=US-ASCII'
577 semi
= type.rfind(';')
578 if semi
>= 0 and '=' not in type[semi
:]:
579 encoding
= type[semi
+1:]
584 msg
.append('Date: %s'%time
.strftime('%a, %d %b %Y %T GMT',
585 time
.gmtime(time
.time())))
586 msg
.append('Content-type: %s' % type)
587 if encoding
== 'base64':
589 data
= base64
.decodestring(data
)
592 msg
.append('Content-Length: %d' % len(data
))
597 headers
= mimetools
.Message(f
, 0)
598 #f.fileno = None # needed for addinfourl
599 return addinfourl(f
, headers
, url
)
602 class FancyURLopener(URLopener
):
603 """Derived class with handlers for errors we can handle (perhaps)."""
605 def __init__(self
, *args
, **kwargs
):
606 URLopener
.__init
__(self
, *args
, **kwargs
)
def http_error_default(self, url, fp, errcode, errmsg, headers):
    """Default error handling -- don't raise an exception."""
    # Hand the (error) response body back to the caller as an ordinary
    # file-like result instead of raising.
    full_url = "http:" + url
    return addinfourl(fp, headers, full_url)
615 def http_error_302(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
616 """Error 302 -- relocated (temporarily)."""
618 if self
.maxtries
and self
.tries
>= self
.maxtries
:
619 if hasattr(self
, "http_error_500"):
620 meth
= self
.http_error_500
622 meth
= self
.http_error_default
624 return meth(url
, fp
, 500,
625 "Internal Server Error: Redirect Recursion", headers
)
626 result
= self
.redirect_internal(url
, fp
, errcode
, errmsg
, headers
,
631 def redirect_internal(self
, url
, fp
, errcode
, errmsg
, headers
, data
):
632 if 'location' in headers
:
633 newurl
= headers
['location']
634 elif 'uri' in headers
:
635 newurl
= headers
['uri']
640 # In case the server sent a relative URL, join with original:
641 newurl
= basejoin(self
.type + ":" + url
, newurl
)
642 return self
.open(newurl
)
def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 301 -- also relocated (permanently)."""
    # Permanent redirects are processed exactly like temporary ones.
    relocate = self.http_error_302
    return relocate(url, fp, errcode, errmsg, headers, data)
def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 303 -- also relocated (essentially identical to 302)."""
    # See Other is handled the same way as a temporary redirect.
    relocate = self.http_error_302
    return relocate(url, fp, errcode, errmsg, headers, data)
def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
    """Error 307 -- relocated, but turn POST into error."""
    # A 307 must not silently convert a POST into a GET, so a request
    # carrying data falls through to the default (non-redirecting) handler.
    if data is not None:
        return self.http_error_default(url, fp, errcode, errmsg, headers)
    return self.http_error_302(url, fp, errcode, errmsg, headers, data)
659 def http_error_401(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
660 """Error 401 -- authentication required.
661 This function supports Basic authentication only."""
662 if not 'www-authenticate' in headers
:
663 URLopener
.http_error_default(self
, url
, fp
,
664 errcode
, errmsg
, headers
)
665 stuff
= headers
['www-authenticate']
667 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
669 URLopener
.http_error_default(self
, url
, fp
,
670 errcode
, errmsg
, headers
)
671 scheme
, realm
= match
.groups()
672 if scheme
.lower() != 'basic':
673 URLopener
.http_error_default(self
, url
, fp
,
674 errcode
, errmsg
, headers
)
675 name
= 'retry_' + self
.type + '_basic_auth'
677 return getattr(self
,name
)(url
, realm
)
679 return getattr(self
,name
)(url
, realm
, data
)
681 def http_error_407(self
, url
, fp
, errcode
, errmsg
, headers
, data
=None):
682 """Error 407 -- proxy authentication required.
683 This function supports Basic authentication only."""
684 if not 'proxy-authenticate' in headers
:
685 URLopener
.http_error_default(self
, url
, fp
,
686 errcode
, errmsg
, headers
)
687 stuff
= headers
['proxy-authenticate']
689 match
= re
.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff
)
691 URLopener
.http_error_default(self
, url
, fp
,
692 errcode
, errmsg
, headers
)
693 scheme
, realm
= match
.groups()
694 if scheme
.lower() != 'basic':
695 URLopener
.http_error_default(self
, url
, fp
,
696 errcode
, errmsg
, headers
)
697 name
= 'retry_proxy_' + self
.type + '_basic_auth'
699 return getattr(self
,name
)(url
, realm
)
701 return getattr(self
,name
)(url
, realm
, data
)
703 def retry_proxy_http_basic_auth(self
, url
, realm
, data
=None):
704 host
, selector
= splithost(url
)
705 newurl
= 'http://' + host
+ selector
706 proxy
= self
.proxies
['http']
707 urltype
, proxyhost
= splittype(proxy
)
708 proxyhost
, proxyselector
= splithost(proxyhost
)
709 i
= proxyhost
.find('@') + 1
710 proxyhost
= proxyhost
[i
:]
711 user
, passwd
= self
.get_user_passwd(proxyhost
, realm
, i
)
712 if not (user
or passwd
): return None
713 proxyhost
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + proxyhost
714 self
.proxies
['http'] = 'http://' + proxyhost
+ proxyselector
716 return self
.open(newurl
)
718 return self
.open(newurl
, data
)
720 def retry_proxy_https_basic_auth(self
, url
, realm
, data
=None):
721 host
, selector
= splithost(url
)
722 newurl
= 'https://' + host
+ selector
723 proxy
= self
.proxies
['https']
724 urltype
, proxyhost
= splittype(proxy
)
725 proxyhost
, proxyselector
= splithost(proxyhost
)
726 i
= proxyhost
.find('@') + 1
727 proxyhost
= proxyhost
[i
:]
728 user
, passwd
= self
.get_user_passwd(proxyhost
, realm
, i
)
729 if not (user
or passwd
): return None
730 proxyhost
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + proxyhost
731 self
.proxies
['https'] = 'https://' + proxyhost
+ proxyselector
733 return self
.open(newurl
)
735 return self
.open(newurl
, data
)
737 def retry_http_basic_auth(self
, url
, realm
, data
=None):
738 host
, selector
= splithost(url
)
739 i
= host
.find('@') + 1
741 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
742 if not (user
or passwd
): return None
743 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
744 newurl
= 'http://' + host
+ selector
746 return self
.open(newurl
)
748 return self
.open(newurl
, data
)
750 def retry_https_basic_auth(self
, url
, realm
, data
=None):
751 host
, selector
= splithost(url
)
752 i
= host
.find('@') + 1
754 user
, passwd
= self
.get_user_passwd(host
, realm
, i
)
755 if not (user
or passwd
): return None
756 host
= quote(user
, safe
='') + ':' + quote(passwd
, safe
='') + '@' + host
757 newurl
= 'https://' + host
+ selector
759 return self
.open(newurl
)
761 return self
.open(newurl
, data
)
763 def get_user_passwd(self
, host
, realm
, clear_cache
= 0):
764 key
= realm
+ '@' + host
.lower()
765 if key
in self
.auth_cache
:
767 del self
.auth_cache
[key
]
769 return self
.auth_cache
[key
]
770 user
, passwd
= self
.prompt_user_passwd(host
, realm
)
771 if user
or passwd
: self
.auth_cache
[key
] = (user
, passwd
)
774 def prompt_user_passwd(self
, host
, realm
):
775 """Override this in a GUI environment!"""
778 user
= raw_input("Enter username for %s at %s: " % (realm
,
780 passwd
= getpass
.getpass("Enter password for %s in %s at %s: " %
783 except KeyboardInterrupt:
792 """Return the IP address of the magic hostname 'localhost'."""
794 if _localhost
is None:
795 _localhost
= socket
.gethostbyname('localhost')
800 """Return the IP address of the current host."""
802 if _thishost
is None:
803 _thishost
= socket
.gethostbyname(socket
.gethostname())
808 """Return the set of errors raised by the FTP class."""
810 if _ftperrors
is None:
812 _ftperrors
= ftplib
.all_errors
817 """Return an empty mimetools.Message object."""
819 if _noheaders
is None:
822 from cStringIO
import StringIO
824 from StringIO
import StringIO
825 _noheaders
= mimetools
.Message(StringIO(), 0)
826 _noheaders
.fp
.close() # Recycle file descriptor
833 """Class used by open_ftp() for cache of open FTP connections."""
835 def __init__(self
, user
, passwd
, host
, port
, dirs
, timeout
=None):
841 self
.timeout
= timeout
847 self
.ftp
= ftplib
.FTP()
848 self
.ftp
.connect(self
.host
, self
.port
, self
.timeout
)
849 self
.ftp
.login(self
.user
, self
.passwd
)
850 for dir in self
.dirs
:
853 def retrfile(self
, file, type):
856 if type in ('d', 'D'): cmd
= 'TYPE A'; isdir
= 1
857 else: cmd
= 'TYPE ' + type; isdir
= 0
859 self
.ftp
.voidcmd(cmd
)
860 except ftplib
.all_errors
:
862 self
.ftp
.voidcmd(cmd
)
864 if file and not isdir
:
865 # Try to retrieve as a file
868 conn
= self
.ftp
.ntransfercmd(cmd
)
869 except ftplib
.error_perm
, reason
:
870 if str(reason
)[:3] != '550':
871 raise IOError, ('ftp error', reason
), sys
.exc_info()[2]
873 # Set transfer mode to ASCII!
874 self
.ftp
.voidcmd('TYPE A')
875 # Try a directory listing
876 if file: cmd
= 'LIST ' + file
878 conn
= self
.ftp
.ntransfercmd(cmd
)
880 # Pass back both a suitably decorated object and a retrieval length
881 return (addclosehook(conn
[0].makefile('rb'),
882 self
.endtransfer
), conn
[1])
883 def endtransfer(self
):
900 """Base class for addinfo and addclosehook."""
902 def __init__(self
, fp
):
904 self
.read
= self
.fp
.read
905 self
.readline
= self
.fp
.readline
906 if hasattr(self
.fp
, "readlines"): self
.readlines
= self
.fp
.readlines
907 if hasattr(self
.fp
, "fileno"):
908 self
.fileno
= self
.fp
.fileno
910 self
.fileno
= lambda: None
911 if hasattr(self
.fp
, "__iter__"):
912 self
.__iter
__ = self
.fp
.__iter
__
913 if hasattr(self
.fp
, "next"):
914 self
.next
= self
.fp
.next
917 return '<%s at %r whose fp = %r>' % (self
.__class
__.__name
__,
923 self
.readlines
= None
925 if self
.fp
: self
.fp
.close()
928 class addclosehook(addbase
):
929 """Class to add a close hook to an open file."""
931 def __init__(self
, fp
, closehook
, *hookargs
):
932 addbase
.__init
__(self
, fp
)
933 self
.closehook
= closehook
934 self
.hookargs
= hookargs
939 self
.closehook(*self
.hookargs
)
940 self
.closehook
= None
943 class addinfo(addbase
):
944 """class to add an info() method to an open file."""
946 def __init__(self
, fp
, headers
):
947 addbase
.__init
__(self
, fp
)
948 self
.headers
= headers
953 class addinfourl(addbase
):
954 """class to add info() and geturl() methods to an open file."""
956 def __init__(self
, fp
, headers
, url
):
957 addbase
.__init
__(self
, fp
)
958 self
.headers
= headers
968 # Utilities to parse URLs (most of these return None for missing parts):
969 # unwrap('<URL:type://host/path>') --> 'type://host/path'
970 # splittype('type:opaquestring') --> 'type', 'opaquestring'
971 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
972 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
973 # splitpasswd('user:passwd') -> 'user', 'passwd'
974 # splitport('host:port') --> 'host', 'port'
975 # splitquery('/path?query') --> '/path', 'query'
976 # splittag('/path#tag') --> '/path', 'tag'
977 # splitattr('/path;attr1=value1;attr2=value2;...') ->
978 # '/path', ['attr1=value1', 'attr2=value2', ...]
979 # splitvalue('attr=value') --> 'attr', 'value'
980 # unquote('abc%20def') -> 'abc def'
981 # quote('abc def') -> 'abc%20def')
990 return isinstance(x
, unicode)
993 """toBytes(u"URL") --> 'URL'."""
994 # Most URL schemes require ASCII. If that changes, the conversion
998 url
= url
.encode("ASCII")
1000 raise UnicodeError("URL " + repr(url
) +
1001 " contains non-ASCII characters")
1005 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
1007 if url
[:1] == '<' and url
[-1:] == '>':
1008 url
= url
[1:-1].strip()
1009 if url
[:4] == 'URL:': url
= url
[4:].strip()
1014 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
1016 if _typeprog
is None:
1018 _typeprog
= re
.compile('^([^/:]+):')
1020 match
= _typeprog
.match(url
)
1022 scheme
= match
.group(1)
1023 return scheme
.lower(), url
[len(scheme
) + 1:]
1028 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
1030 if _hostprog
is None:
1032 _hostprog
= re
.compile('^//([^/?]*)(.*)$')
1034 match
= _hostprog
.match(url
)
1035 if match
: return match
.group(1, 2)
1039 def splituser(host
):
1040 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
1042 if _userprog
is None:
1044 _userprog
= re
.compile('^(.*)@(.*)$')
1046 match
= _userprog
.match(host
)
1047 if match
: return map(unquote
, match
.group(1, 2))
1051 def splitpasswd(user
):
1052 """splitpasswd('user:passwd') -> 'user', 'passwd'."""
1054 if _passwdprog
is None:
1056 _passwdprog
= re
.compile('^([^:]*):(.*)$')
1058 match
= _passwdprog
.match(user
)
1059 if match
: return match
.group(1, 2)
1062 # splittag('/path#tag') --> '/path', 'tag'
1064 def splitport(host
):
1065 """splitport('host:port') --> 'host', 'port'."""
1067 if _portprog
is None:
1069 _portprog
= re
.compile('^(.*):([0-9]+)$')
1071 match
= _portprog
.match(host
)
1072 if match
: return match
.group(1, 2)
1076 def splitnport(host
, defport
=-1):
1077 """Split host and port, returning numeric port.
1078 Return given default port if no ':' found; defaults to -1.
1079 Return numerical port if a valid number are found after ':'.
1080 Return None if ':' but not a valid number."""
1082 if _nportprog
is None:
1084 _nportprog
= re
.compile('^(.*):(.*)$')
1086 match
= _nportprog
.match(host
)
1088 host
, port
= match
.group(1, 2)
1090 if not port
: raise ValueError, "no digits"
1095 return host
, defport
1098 def splitquery(url
):
1099 """splitquery('/path?query') --> '/path', 'query'."""
1101 if _queryprog
is None:
1103 _queryprog
= re
.compile('^(.*)\?([^?]*)$')
1105 match
= _queryprog
.match(url
)
1106 if match
: return match
.group(1, 2)
1111 """splittag('/path#tag') --> '/path', 'tag'."""
1113 if _tagprog
is None:
1115 _tagprog
= re
.compile('^(.*)#([^#]*)$')
1117 match
= _tagprog
.match(url
)
1118 if match
: return match
.group(1, 2)
1122 """splitattr('/path;attr1=value1;attr2=value2;...') ->
1123 '/path', ['attr1=value1', 'attr2=value2', ...]."""
1124 words
= url
.split(';')
1125 return words
[0], words
[1:]
1128 def splitvalue(attr
):
1129 """splitvalue('attr=value') --> 'attr', 'value'."""
1131 if _valueprog
is None:
1133 _valueprog
= re
.compile('^([^=]*)=(.*)$')
1135 match
= _valueprog
.match(attr
)
1136 if match
: return match
.group(1, 2)
1139 _hextochr
= dict(('%02x' % i
, chr(i
)) for i
in range(256))
1140 _hextochr
.update(('%02X' % i
, chr(i
)) for i
in range(256))
1143 """unquote('abc%20def') -> 'abc def'."""
1145 for i
in xrange(1, len(res
)):
1148 res
[i
] = _hextochr
[item
[:2]] + item
[2:]
1151 except UnicodeDecodeError:
1152 res
[i
] = unichr(int(item
[:2], 16)) + item
[2:]
1155 def unquote_plus(s
):
1156 """unquote('%7e/abc+def') -> '~/abc def'"""
1157 s
= s
.replace('+', ' ')
1160 always_safe
= ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
1161 'abcdefghijklmnopqrstuvwxyz'
1165 def quote(s
, safe
= '/'):
1166 """quote('abc def') -> 'abc%20def'
1168 Each part of a URL, e.g. the path info, the query, etc., has a
1169 different set of reserved characters that must be quoted.
1171 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
1172 the following reserved characters.
1174 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
1177 Each of these characters is reserved in some component of a URL,
1178 but not necessarily in all of them.
1180 By default, the quote function is intended for quoting the path
1181 section of a URL. Thus, it will not encode '/'. This character
1182 is reserved, but in typical usage the quote function is being
1183 called on a path where the existing slash characters are used as
1184 reserved characters.
1186 cachekey
= (safe
, always_safe
)
1188 safe_map
= _safemaps
[cachekey
]
1192 for i
in range(256):
1194 safe_map
[c
] = (c
in safe
) and c
or ('%%%02X' % i
)
1195 _safemaps
[cachekey
] = safe_map
1196 res
= map(safe_map
.__getitem
__, s
)
def quote_plus(s, safe=''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    # Strings without spaces take the plain quote() fast path.  Otherwise
    # spaces are kept unescaped by quote() and then turned into '+'.
    if ' ' not in s:
        return quote(s, safe)
    quoted = quote(s, safe + ' ')
    return quoted.replace(' ', '+')
1206 def urlencode(query
,doseq
=0):
1207 """Encode a sequence of two-element tuples or dictionary into a URL query string.
1209 If any values in the query arg are sequences and doseq is true, each
1210 sequence element is converted to a separate parameter.
1212 If the query arg is a sequence of two-element tuples, the order of the
1213 parameters in the output will match the order of parameters in the
1217 if hasattr(query
,"items"):
1219 query
= query
.items()
1221 # it's a bother at times that strings and string-like objects are
1224 # non-sequence items should not work with len()
1225 # non-empty strings will fail this
1226 if len(query
) and not isinstance(query
[0], tuple):
1228 # zero-length sequences of all types will get here and succeed,
1229 # but that's a minor nit - since the original implementation
1230 # allowed empty dicts that type of behavior probably should be
1231 # preserved for consistency
1233 ty
,va
,tb
= sys
.exc_info()
1234 raise TypeError, "not a valid non-string sequence or mapping object", tb
1238 # preserve old behavior
1240 k
= quote_plus(str(k
))
1241 v
= quote_plus(str(v
))
1242 l
.append(k
+ '=' + v
)
1245 k
= quote_plus(str(k
))
1246 if isinstance(v
, str):
1248 l
.append(k
+ '=' + v
)
1249 elif _is_unicode(v
):
1250 # is there a reasonable way to convert to ASCII?
1251 # encode generates a string, but "replace" or "ignore"
1252 # lose information and "strict" can raise UnicodeError
1253 v
= quote_plus(v
.encode("ASCII","replace"))
1254 l
.append(k
+ '=' + v
)
1257 # is this a sufficient test for sequence-ness?
1261 v
= quote_plus(str(v
))
1262 l
.append(k
+ '=' + v
)
1264 # loop over the sequence
1266 l
.append(k
+ '=' + quote_plus(str(elt
)))
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    # NOTE(review): the dict initializer and the return were missing from
    # the garbled extraction and have been restored.
    proxies = {}
    for name, value in os.environ.items():
        # e.g. 'http_proxy' -> proxies['http']; empty values are ignored.
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        # NOTE(review): the ic import/instantiation guards and the final
        # return were missing from the garbled extraction and have been
        # restored; confirm against a MacPython checkout if possible.
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        # No bypass support via Internet Config; never bypass.
        return 0

    def getproxies():
        return getproxies_environment() or getproxies_internetconfig()
1323 elif os
.name
== 'nt':
1324 def getproxies_registry():
1325 """Return a dictionary of scheme -> proxy server URL mappings.
1327 Win32 uses the registry to store proxies.
1334 # Std module, so should be around - but you never know!
1337 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1338 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1339 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1342 # Returned as Unicode but problems if not converted to ASCII
1343 proxyServer
= str(_winreg
.QueryValueEx(internetSettings
,
1345 if '=' in proxyServer
:
1346 # Per-protocol settings
1347 for p
in proxyServer
.split(';'):
1348 protocol
, address
= p
.split('=', 1)
1349 # See if address has a type:// prefix
1351 if not re
.match('^([^/:]+)://', address
):
1352 address
= '%s://%s' % (protocol
, address
)
1353 proxies
[protocol
] = address
1355 # Use one setting for all protocols
1356 if proxyServer
[:5] == 'http:':
1357 proxies
['http'] = proxyServer
1359 proxies
['http'] = 'http://%s' % proxyServer
1360 proxies
['ftp'] = 'ftp://%s' % proxyServer
1361 internetSettings
.Close()
1362 except (WindowsError, ValueError, TypeError):
1363 # Either registry key not found etc, or the value in an
1364 # unexpected format.
1365 # proxies already set up to be empty so nothing to do
1370 """Return a dictionary of scheme -> proxy server URL mappings.
1372 Returns settings gathered from the environment, if specified,
1376 return getproxies_environment() or getproxies_registry()
1378 def proxy_bypass(host
):
1383 # Std modules, so should be around - but you never know!
1386 internetSettings
= _winreg
.OpenKey(_winreg
.HKEY_CURRENT_USER
,
1387 r
'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
1388 proxyEnable
= _winreg
.QueryValueEx(internetSettings
,
1390 proxyOverride
= str(_winreg
.QueryValueEx(internetSettings
,
1391 'ProxyOverride')[0])
1392 # ^^^^ Returned as Unicode but problems if not converted to ASCII
1393 except WindowsError:
1395 if not proxyEnable
or not proxyOverride
:
1397 # try to make a host list from name and IP address.
1398 rawHost
, port
= splitport(host
)
1401 addr
= socket
.gethostbyname(rawHost
)
1404 except socket
.error
:
1407 fqdn
= socket
.getfqdn(rawHost
)
1410 except socket
.error
:
1412 # make a check value list from the registry entry: replace the
1413 # '<local>' string by the localhost entry and the corresponding
1415 proxyOverride
= proxyOverride
.split(';')
1417 while i
< len(proxyOverride
):
1418 if proxyOverride
[i
] == '<local>':
1419 proxyOverride
[i
:i
+1] = ['localhost',
1421 socket
.gethostname(),
1422 socket
.gethostbyname(
1423 socket
.gethostname())]
1425 # print proxyOverride
1426 # now check if we match one of the registry values.
1427 for test
in proxyOverride
:
1428 test
= test
.replace(".", r
"\.") # mask dots
1429 test
= test
.replace("*", r
".*") # change glob sequence
1430 test
= test
.replace("?", r
".") # change glob char
1432 # print "%s <--> %s" %( test, val )
1433 if re
.match(test
, val
, re
.I
):
1438 # By default use environment variables
1439 getproxies
= getproxies_environment
1441 def proxy_bypass(host
):
1444 # Test and time quote() and unquote()
1447 for i
in range(256): s
= s
+ chr(i
)
1458 print round(t1
- t0
, 3), 'sec'
1461 def reporthook(blocknum
, blocksize
, totalsize
):
1462 # Report during remote transfers
1463 print "Block number: %d, Block size: %d, Total size: %d" % (
1464 blocknum
, blocksize
, totalsize
)
1472 'file://localhost/etc/passwd',
1473 'ftp://ftp.gnu.org/pub/README',
1474 'http://www.python.org/index.html',
1476 if hasattr(URLopener
, "open_https"):
1477 args
.append('https://synergy.as.cmu.edu/~geek/')
1480 print '-'*10, url
, '-'*10
1481 fn
, h
= urlretrieve(url
, None, reporthook
)
1485 for k
in h
.keys(): print k
+ ':', h
[k
]
1491 table
= string
.maketrans("", "")
1492 data
= data
.translate(table
, "\r")
1502 opts
, args
= getopt
.getopt(sys
.argv
[1:], "th")
1503 except getopt
.error
, msg
:
1505 print "Use -h for help"
1512 print "Usage: python urllib.py [-t] [url ...]"
1513 print "-t runs self-test;",
1514 print "otherwise, contents of urls are printed"
1522 print "Use -h for help"
1524 print urlopen(url
).read(),
1526 # Run test program when run as a script
1527 if __name__
== '__main__':