*quietly adds a file that should have been in the tree since 9839a*
[halbot.git] / urllib2.py
blobd8d556924b1aa830f1c17d0e5176357a614d96f4
1 """An extensible library for opening URLs using a variety of protocols
3 The simplest way to use this module is to call the urlopen function,
4 which accepts a string containing a URL or a Request object (described
5 below). It opens the URL and returns the results as a file-like
6 object; the returned object has some extra methods described below.
8 The OpenerDirector manages a collection of Handler objects that do
9 all the actual work. Each Handler implements a particular protocol or
10 option. The OpenerDirector is a composite object that invokes the
11 Handlers needed to open the requested URL. For example, the
12 HTTPHandler performs HTTP GET and POST requests and deals with
13 non-error returns. The HTTPRedirectHandler automatically deals with
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
15 deals with digest authentication.
17 urlopen(url, data=None) -- basic usage is the same as original
18 urllib. pass the url and optionally data to post to an HTTP URL, and
19 get a file-like object back. One difference is that you can also pass
20 a Request instance instead of URL. Raises a URLError (subclass of
21 IOError); for HTTP errors, raises an HTTPError, which can also be
22 treated as a valid response.
24 build_opener -- function that creates a new OpenerDirector instance.
25 will install the default handlers. accepts one or more Handlers as
26 arguments, either instances or Handler classes that it will
27 instantiate. if one of the arguments is a subclass of the default
28 handler, the argument will be installed instead of the default.
30 install_opener -- installs a new opener as the default opener.
32 objects of interest:
33 OpenerDirector --
35 Request -- an object that encapsulates the state of a request. the
36 state can be as simple as the URL. it can also include extra HTTP
37 headers, e.g. a User-Agent.
39 BaseHandler --
41 exceptions:
42 URLError-- a subclass of IOError, individual protocols have their own
43 specific subclass
45 HTTPError-- also a valid HTTP response, so you can treat an HTTP error
46 as an exceptional event or valid response
48 internals:
49 BaseHandler and parent
50 _call_chain conventions
52 Example usage:
54 import urllib2
56 # set up authentication info
57 authinfo = urllib2.HTTPBasicAuthHandler()
58 authinfo.add_password('realm', 'host', 'username', 'password')
60 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
62 # build a new opener that adds authentication and caching FTP handlers
63 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
65 # install it
66 urllib2.install_opener(opener)
68 f = urllib2.urlopen('http://www.python.org/')
71 """
73 # XXX issues:
74 # If an authentication error handler tries to perform
75 # authentication for some reason but fails, how should the error be
76 # signalled? The client needs to know the HTTP error code. But if
77 # the handler knows what the problem was, e.g., that it didn't know
78 # the hash algo that was requested in the challenge, it would be good to
79 # pass that information along to the client, too.
81 # XXX to do:
82 # name!
83 # documentation (getting there)
84 # complex proxies
85 # abstract factory for opener
86 # ftp errors aren't handled cleanly
87 # gopher can return a socket.error
88 # check digest against correct (i.e. non-apache) implementation
90 import base64
91 import ftplib
92 import gopherlib
93 import httplib
94 import inspect
95 import md5
96 import mimetypes
97 import mimetools
98 import os
99 import posixpath
100 import random
101 import re
102 import rfc822
103 import sha
104 import socket
105 import sys
106 import time
107 import urlparse
109 try:
110 from cStringIO import StringIO
111 except ImportError:
112 from StringIO import StringIO
114 # not sure how many of these need to be gotten rid of
115 from urllib import unwrap, unquote, splittype, splithost, \
116 addinfourl, splitport, splitgophertype, splitquery, \
117 splitattr, ftpwrapper, noheaders, splituser, splitpasswd
119 # support for FileHandler, proxies via environment variables
120 from urllib import localhost, url2pathname, getproxies
122 __version__ = "2.1"
# Module-wide default opener.  It is created lazily by urlopen() or set
# explicitly through install_opener().
_opener = None

def urlopen(url, data=None):
    """Open url with the module-level default opener.

    The default opener is built with build_opener() on first use unless
    one has already been installed.  url and data are handed unchanged
    to the opener's open() method, whose result is returned.
    """
    global _opener
    opener = _opener
    if opener is None:
        opener = _opener = build_opener()
    return opener.open(url, data)
def install_opener(opener):
    """Make opener the module-level default used by urlopen()."""
    # Rebinds the module global; no validation of opener is performed.
    global _opener
    _opener = opener
135 # do these error classes make sense?
136 # make sure all of the IOError stuff is overridden. we just want to be
137 # subtypes.
class URLError(IOError):
    """Error raised by the URL handlers; the cause is kept in self.reason.

    URLError is a sub-type of IOError but does not share any of its
    implementation: __init__ and __str__ are overridden here and
    IOError.__init__ is not called.
    """

    def __init__(self, reason):
        # reason is stored as given (a message string or another exception).
        self.reason = reason

    def __str__(self):
        return '<urlopen error %s>' % self.reason
class HTTPError(URLError, addinfourl):
    """An HTTP error response that doubles as a file-like result.

    Instances are raised as exceptions but, when a response body is
    available, are also initialised as addinfourl objects so callers can
    treat them like a normal return value.

    Attributes set by __init__: code, msg, hdrs, fp and filename (the
    URL that was requested).
    """
    __super_init = addinfourl.__init__

    def __init__(self, url, code, msg, hdrs, fp):
        self.code = code
        self.msg = msg
        self.hdrs = hdrs
        self.fp = fp
        self.filename = url
        # addinfourl needs fp to be a valid file object.  Some HTTPErrors
        # are created without one; in that case the simplest workaround
        # is to leave the addinfourl base uninitialised.
        if fp is None:
            return
        self.__super_init(fp, hdrs, url)

    def __str__(self):
        return 'HTTP Error %s: %s' % (self.code, self.msg)

    def __del__(self):
        # XXX is this safe? what if user catches exception, then
        # extracts fp and discards exception?
        if self.fp:
            self.fp.close()
class GopherError(URLError):
    """URLError subclass for failures specific to gopher URLs."""
class Request:
    """Encapsulate the state of one URL request.

    url     -- the URL string; a '<URL:...>' wrapper is removed by unwrap().
    data    -- optional request body; its presence turns the request
               into a POST (see get_method()).
    headers -- optional mapping of extra headers; keys are normalised
               with str.capitalize() by add_header().

    The scheme, host and selector are split out of the URL lazily by
    get_type(), get_host() and get_selector().
    """

    def __init__(self, url, data=None, headers={}):
        # unwrap('<URL:type://host/path>') --> 'type://host/path'
        self.__original = unwrap(url)
        self.type = None
        # self.__r_type is what's left after doing the splittype
        self.host = None
        self.port = None
        self.data = data
        self.headers = {}
        # The default mapping is only read here, never mutated, so
        # sharing it between calls is harmless.
        for key, value in headers.items():
            self.add_header(key, value)

    def __getattr__(self, attr):
        """Compute the private __r_type / __r_host attributes on demand.

        This is a fallback that guards against the get_* methods being
        called in a non-standard order.  Raises AttributeError for any
        other missing attribute, and also when the matching getter ran
        but did not create the attribute (previously that case recursed
        until the interpreter's recursion limit was hit).
        """
        # XXX should the __r_XXX attributes be public?
        if attr[:12] == '_Request__r_':
            getter = 'get_' + attr[12:]
            if hasattr(Request, getter):
                getattr(self, getter)()
                # Look in the instance dict directly: going through
                # getattr() again would re-enter __getattr__.
                try:
                    return self.__dict__[attr]
                except KeyError:
                    pass
        raise AttributeError(attr)

    def get_method(self):
        """Return "POST" when the request carries data, else "GET"."""
        if self.has_data():
            return "POST"
        else:
            return "GET"

    def add_data(self, data):
        """Attach (or replace) the request body."""
        self.data = data

    def has_data(self):
        """Return true if a request body has been set."""
        return self.data is not None

    def get_data(self):
        """Return the request body, or None."""
        return self.data

    def get_full_url(self):
        """Return the unwrapped URL the request was created with."""
        return self.__original

    def get_type(self):
        """Return the URL scheme, splitting it off the URL on first use.

        Raises ValueError when the URL has no scheme.
        """
        if self.type is None:
            self.type, self.__r_type = splittype(self.__original)
            if self.type is None:
                raise ValueError("unknown url type: %s" % self.__original)
        return self.type

    def get_host(self):
        """Return the unquoted host part, splitting it off on first use."""
        if self.host is None:
            self.host, self.__r_host = splithost(self.__r_type)
            if self.host:
                self.host = unquote(self.host)
        return self.host

    def get_selector(self):
        """Return what follows the host, or the full URL after set_proxy()."""
        return self.__r_host

    def set_proxy(self, host, type):
        """Redirect the request through a proxy at host using scheme type.

        The selector becomes the full original URL, which is what gets
        sent to the proxy.
        """
        self.host, self.type = host, type
        self.__r_host = self.__original

    def add_header(self, key, val):
        """Set header key to val; key is normalised with capitalize()."""
        # useful for something like authentication
        self.headers[key.capitalize()] = val
class OpenerDirector:
    """Dispatch requests and error responses to registered handlers.

    Handlers are indexed by the names of their methods:
    '<protocol>_open' methods go into handle_open[protocol] and
    '<protocol>_error_<kind>' methods go into
    handle_error[protocol][kind].
    """

    def __init__(self):
        #server_version = "Python-urllib/%s" % __version__
        self.addheaders = [('User-agent', "Halbot-0.1")]
        # manage the individual handlers
        self.handlers = []
        self.handle_open = {}
        self.handle_error = {}

    def add_handler(self, handler):
        """Register handler under every protocol/error method it defines.

        The handler is added to self.handlers and given this director as
        parent only when at least one of its attribute names matched.
        """
        registered = 0
        for attr_name in dir(handler):
            if attr_name.endswith('_open'):
                protocol = attr_name[:-5]
                openers = self.handle_open.setdefault(protocol, [])
                openers.append(handler)
                openers.sort()
                registered = 1
                continue
            # Locate the first two underscores: '<proto>_error_<kind>'.
            first = attr_name.find('_')
            second = attr_name[first+1:].find('_') + first + 1
            if second != -1 and attr_name[first+1:second] == 'error':
                proto = attr_name[:first]
                kind = attr_name[second+1:]
                try:
                    kind = int(kind)
                except ValueError:
                    # Non-numeric kinds (e.g. 'default') stay strings.
                    pass
                by_kind = self.handle_error.get(proto, {})
                chain = by_kind.setdefault(kind, [])
                chain.append(handler)
                chain.sort()
                self.handle_error[proto] = by_kind
                registered = 1
                continue
        if registered:
            self.handlers.append(handler)
            self.handlers.sort()
            handler.add_parent(self)

    def __del__(self):
        self.close()

    def close(self):
        """Close every registered handler and forget them."""
        for handler in self.handlers:
            handler.close()
        self.handlers = []

    def _call_chain(self, chain, kind, meth_name, *args):
        """Call meth_name(*args) on each handler in chain[kind] in order.

        Returns the first result that is not None; returns None when
        every handler declined or none is registered.
        """
        # XXX raise an exception if no one else should try to handle
        # this url.  return None if you can't but someone else could.
        for handler in chain.get(kind, ()):
            outcome = getattr(handler, meth_name)(*args)
            if outcome is None:
                continue
            return outcome

    def open(self, fullurl, data=None):
        """Open fullurl (a URL string or Request) and return the response.

        Tries the 'default' chain first, then the chain for the
        request's own scheme, then the 'unknown' chain.
        """
        # accept a URL or a Request object
        if not isinstance(fullurl, basestring):
            req = fullurl
            if data is not None:
                req.add_data(data)
        else:
            req = Request(fullurl, data)

        response = self._call_chain(self.handle_open, 'default',
                                    'default_open', req)
        if response:
            return response

        scheme = req.get_type()
        response = self._call_chain(self.handle_open, scheme,
                                    scheme + '_open', req)
        if response:
            return response

        return self._call_chain(self.handle_open, 'unknown',
                                'unknown_open', req)

    def error(self, proto, *args):
        """Dispatch an error to the handlers registered for it.

        For 'http' and 'https' the HTTP status code is taken from
        args[2] and used to select http_error_<code>; if no such handler
        returns a true result, the http_error_default chain is tried.
        Other protocols are dispatched to '<proto>_error'.
        """
        is_http = proto in ['http', 'https']
        if is_http:
            # XXX http[s] protocols are special-cased
            table = self.handle_error['http'] # https is not different than http
            code = args[2]  # YUCK!
            chain_args = (table, code, 'http_error_%d' % code) + args
        else:
            table = self.handle_error
            chain_args = (table, proto, proto + '_error') + args
        outcome = self._call_chain(*chain_args)
        if outcome:
            return outcome

        if is_http:
            fallback_args = (table, 'default', 'http_error_default') + args
            return self._call_chain(*fallback_args)
354 # XXX probably also want an abstract factory that knows when it makes
355 # sense to skip a superclass in favor of a subclass and when it might
356 # make sense to include both
def build_opener(*handlers):
    """Create an opener object from a list of handlers.

    The opener will use several default handlers, including support
    for HTTP and FTP.

    If any of the handlers passed as arguments are subclasses of the
    default handlers, the default handlers will not be used.

    handlers may be handler instances or handler classes; classes are
    instantiated with no arguments.  Default handlers are installed
    first, followed by the supplied handlers in the order given.
    """
    opener = OpenerDirector()
    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
                       FTPHandler, FileHandler]
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(HTTPSHandler)

    def replaces(candidate, klass):
        # True when candidate (a class or an instance) specialises klass.
        if inspect.isclass(candidate):
            return issubclass(candidate, klass)
        return isinstance(candidate, klass)

    for klass in default_classes:
        # Decide per default class whether it is overridden.  The earlier
        # collect-then-remove approach raised ValueError from
        # list.remove() when two supplied handlers overrode the same
        # default class.
        overridden = False
        for candidate in handlers:
            if replaces(candidate, klass):
                overridden = True
                break
        if not overridden:
            opener.add_handler(klass())

    for h in handlers:
        if inspect.isclass(h):
            h = h()
        opener.add_handler(h)
    return opener
class BaseHandler:
    """Common base for handlers managed by an OpenerDirector.

    handler_order determines the position in a handler chain: handlers
    compare by it, and lower values sort first.
    """
    handler_order = 500

    def add_parent(self, parent):
        """Remember the director this handler was registered with."""
        self.parent = parent

    def close(self):
        """Drop the reference to the parent director."""
        self.parent = None

    def __lt__(self, other):
        if hasattr(other, "handler_order"):
            return self.handler_order < other.handler_order
        # Try to preserve the old behavior of having custom classes
        # inserted after default ones (works only for custom user
        # classes which are not aware of handler_order).
        return True
class HTTPDefaultErrorHandler(BaseHandler):
    """Fallback HTTP error handler: turns the response into an HTTPError."""

    def http_error_default(self, req, fp, code, msg, hdrs):
        # Always raises; the error carries the response body fp.
        url = req.get_full_url()
        raise HTTPError(url, code, msg, hdrs, fp)
class HTTPRedirectHandler(BaseHandler):
    """Follow HTTP 301, 302, 303 and 307 redirects, with loop detection."""

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        safe_method = code in (301, 302, 303, 307) and method in ("GET", "HEAD")
        post_redirect = code in (301, 302, 303) and method == "POST"
        if not (safe_method or post_redirect):
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response
        # to a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib2, in this case).  In practice,
        # essentially all clients do redirect in this case, so we
        # do the same.
        return Request(newurl, headers=req.headers)

    # Implementation note: To avoid the server sending us into an
    # infinite loop, the request object needs to track what URLs we
    # have already seen.  Do this by adding a handler-specific
    # attribute to the Request object.
    def http_error_302(self, req, fp, code, msg, headers):
        """Re-issue the request at the URL named by the response headers.

        Returns None when the response carries neither a 'location' nor
        a 'uri' header, or when redirect_request() declines.  Raises
        HTTPError when the target was already visited or too many
        redirects have been followed.
        """
        if 'location' in headers:
            target = headers['location']
        elif 'uri' in headers:
            target = headers['uri']
        else:
            return
        target = urlparse.urljoin(req.get_full_url(), target)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(req, fp, code, msg, headers, target)
        if new is None:
            return

        # loop detection: carry the set of visited URLs from request to
        # request in error_302_dict.
        new.error_302_dict = {}
        if hasattr(req, 'error_302_dict'):
            seen = req.error_302_dict
            if len(seen) > 10 or target in seen:
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
            new.error_302_dict.update(seen)
        new.error_302_dict[target] = target

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
class ProxyHandler(BaseHandler):
    """Route requests through per-scheme proxies.

    proxies maps a URL scheme to a proxy URL; when omitted, the mapping
    returned by getproxies() is used.  For every scheme in the mapping
    an instance attribute '<scheme>_open' is created that forwards to
    proxy_open().
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, proxies=None):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        for type, url in proxies.items():
            # Bind url/type/meth as defaults so each lambda keeps the
            # values of its own loop iteration.
            setattr(self, '%s_open' % type,
                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
                    meth(r, proxy, type))

    def proxy_open(self, req, proxy, type):
        """Point req at proxy and decide who performs the request.

        Credentials embedded in the proxy URL ('user:password@host') are
        sent as a Basic Proxy-authorization header.  Returns None when
        the proxy uses the request's own scheme, so later handlers
        perform the request; otherwise re-dispatches through the parent.
        """
        orig_type = req.get_type()
        type, r_type = splittype(proxy)
        host, XXX = splithost(r_type)
        if '@' in host:
            user_pass, host = host.split('@', 1)
            if ':' in user_pass:
                user, password = user_pass.split(':', 1)
                # encodestring() appends a newline; strip it so the
                # header value does not terminate the header block
                # early (retry_http_basic_auth strips it as well).
                user_pass = base64.encodestring(
                    '%s:%s' % (unquote(user), unquote(password))).strip()
            req.add_header('Proxy-authorization', 'Basic ' + user_pass)
        host = unquote(host)
        req.set_proxy(host, type)
        if orig_type == type:
            # let other handlers take care of it
            # XXX this only makes sense if the proxy is before the
            # other handlers
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            return self.parent.open(req)
517 # feature suggested by Duncan Booth
518 # XXX custom is not a good name
class CustomProxy:
    """A proxy description for one protocol with an optional predicate.

    Either pass a function to the constructor or override handle().
    """

    def __init__(self, proto, func=None, proxy_addr=None):
        self.proto = proto
        self.func = func
        self.addr = proxy_addr

    def handle(self, req):
        """Return 1 if func is set and accepts req; otherwise None."""
        if not self.func:
            return None
        if self.func(req):
            return 1
        return None

    def get_proxy(self):
        """Return the proxy address given to the constructor."""
        return self.addr
class CustomProxyHandler(BaseHandler):
    """Select a proxy per request from registered CustomProxy objects.

    Proxies are grouped by their proto attribute; for a request, the
    first proxy of the matching protocol whose handle(req) is true is
    used.
    """
    # Proxies must be in front
    handler_order = 100

    def __init__(self, *proxies):
        self.proxies = {}
        # Register the proxies passed to the constructor; previously
        # they were accepted but silently discarded.
        for cpo in proxies:
            self.add_proxy(cpo)

    def proxy_open(self, req):
        """Open req through the first proxy that accepts it, else None."""
        proto = req.get_type()
        try:
            proxies = self.proxies[proto]
        except KeyError:
            return None
        for p in proxies:
            if p.handle(req):
                # Request.set_proxy() takes (host, type); the previous
                # one-argument call always raised TypeError.
                # NOTE(review): assumes the proxy speaks the request's
                # own protocol -- confirm against intended usage.
                req.set_proxy(p.get_proxy(), proto)
                return self.parent.open(req)
        return None

    def do_proxy(self, p, req):
        return self.parent.open(req)

    def add_proxy(self, cpo):
        """Register the CustomProxy cpo under its protocol."""
        if cpo.proto in self.proxies:
            self.proxies[cpo.proto].append(cpo)
        else:
            self.proxies[cpo.proto] = [cpo]
class HTTPPasswordMgr:
    """Store (user, password) pairs keyed by realm and URI prefix."""

    def __init__(self):
        # realm -> {tuple of reduced URIs -> (user, passwd)}
        self.passwd = {}

    def add_password(self, realm, uri, user, passwd):
        """Register credentials for realm at uri.

        uri could be a single URI or a sequence of URIs.
        """
        if isinstance(uri, basestring):
            uri = [uri]
        reduced = tuple([self.reduce_uri(item) for item in uri])
        self.passwd.setdefault(realm, {})[reduced] = (user, passwd)

    def find_user_password(self, realm, authuri):
        """Return the (user, password) stored for realm covering authuri.

        Returns (None, None) when nothing matches.
        """
        target = self.reduce_uri(authuri)
        for uris, authinfo in self.passwd.get(realm, {}).items():
            for candidate in uris:
                if self.is_suburi(candidate, target):
                    return authinfo
        return None, None

    def reduce_uri(self, uri):
        """Accept netloc or URI and extract only the netloc and path"""
        parts = urlparse.urlparse(uri)
        netloc, path = parts[1], parts[2]
        if netloc:
            return netloc, path or '/'
        # No netloc was parsed: treat the path component as the netloc.
        return path, '/'

    def is_suburi(self, base, test):
        """Check if test is below base in a URI tree

        Both args must be URIs in reduced form.
        """
        if base == test:
            return True
        if base[0] != test[0]:
            return False
        # NOTE(review): commonprefix compares character by character,
        # so '/ab' also counts as being below '/a'.
        shared = posixpath.commonprefix((base[1], test[1]))
        return len(shared) == len(base[1])
class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
    """Password manager that falls back to entries stored under realm None."""

    def find_user_password(self, realm, authuri):
        """Look up realm first; if no user is found, retry with realm None."""
        lookup = HTTPPasswordMgr.find_user_password
        user, password = lookup(self, realm, authuri)
        if user is None:
            return lookup(self, None, authuri)
        return user, password
class AbstractBasicAuthHandler:
    """Shared logic for answering HTTP Basic authentication challenges.

    Subclasses provide auth_header, the name of the request header that
    carries the credentials.
    """

    rx = re.compile('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', re.I)

    # XXX there can actually be multiple auth-schemes in a
    # www-authenticate header.  should probably be a lot more careful
    # in parsing them to extract multiple alternatives

    def __init__(self, password_mgr=None):
        if password_mgr is None:
            password_mgr = HTTPPasswordMgr()
        self.passwd = password_mgr
        # Expose the manager's add_password directly on the handler.
        self.add_password = self.passwd.add_password

    def http_error_auth_reqed(self, authreq, host, req, headers):
        """Retry req with Basic credentials if the challenge asks for them.

        authreq is the name of the challenge header to read from
        headers.  Returns None when the header is absent, does not match
        rx, or names a scheme other than 'basic'.
        """
        # XXX could be multiple headers
        challenge = headers.get(authreq, None)
        if not challenge:
            return None
        mo = AbstractBasicAuthHandler.rx.match(challenge)
        if not mo:
            return None
        scheme, realm = mo.groups()
        if scheme.lower() != 'basic':
            return None
        return self.retry_http_basic_auth(host, req, realm)

    def retry_http_basic_auth(self, host, req, realm):
        """Re-open req with credentials for realm, or return None.

        None is returned when no password is known or when the same
        credentials are already present on the request.
        """
        user, pw = self.passwd.find_user_password(realm, host)
        if pw is None:
            return None
        raw = "%s:%s" % (user, pw)
        auth = 'Basic %s' % base64.encodestring(raw).strip()
        if req.headers.get(self.auth_header, None) == auth:
            # These credentials were already sent and rejected.
            return None
        req.add_header(self.auth_header, auth)
        return self.parent.open(req)
class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 401 responses with Basic credentials."""

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        # The network location of the requested URL is what the
        # password lookup is performed against.
        netloc = urlparse.urlparse(req.get_full_url())[1]
        return self.http_error_auth_reqed('www-authenticate',
                                          netloc, req, headers)
class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
    """Answer 407 responses with Basic proxy credentials."""

    auth_header = 'Proxy-authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        # The password lookup uses the request's host.
        proxy_host = req.get_host()
        return self.http_error_auth_reqed('proxy-authenticate',
                                          proxy_host, req, headers)
def randombytes(n):
    """Return n random bytes.

    Uses os.urandom(), the operating system's randomness source, when it
    is available.  Falls back to the random module when os.urandom is
    missing or reports that no source exists.
    """
    try:
        # os.urandom replaces the hand-rolled /dev/urandom read, which
        # leaked the file handle if read() raised and could return fewer
        # than n bytes.
        return os.urandom(n)
    except (AttributeError, NotImplementedError):
        return "".join([chr(random.randrange(0, 256)) for i in range(n)])
class AbstractDigestAuthHandler:
    """Shared logic for answering HTTP Digest authentication challenges.

    Subclasses provide auth_header, the name of the request header that
    carries the credentials.
    """
    # Digest authentication is specified in RFC 2617.

    # XXX The client does not inspect the Authentication-Info header
    # in a successful response.

    # XXX It should be possible to test this implementation against
    # a mock server that just generates a static set of challenges.

    # XXX qop="auth-int" supports is shaky

    def __init__(self, passwd=None):
        if passwd is None:
            passwd = HTTPPasswordMgr()
        self.passwd = passwd
        self.add_password = self.passwd.add_password
        self.retried = 0
        self.nonce_count = 0

    def reset_retry_count(self):
        """Reset the counter that limits repeated authentication attempts."""
        self.retried = 0

    def http_error_auth_reqed(self, auth_header, host, req, headers):
        """Retry req with Digest credentials if the challenge asks for them.

        auth_header is the name of the challenge header to read from
        headers.  Raises HTTPError once more than five attempts have
        been made, and ValueError when the challenge names a scheme
        other than 'digest'.  Returns None when no challenge is present.
        """
        authreq = headers.get(auth_header, None)
        if self.retried > 5:
            # Don't fail endlessly - if we failed once, we'll probably
            # fail a second time. Hm. Unless the Password Manager is
            # prompting for the information. Crap. This isn't great
            # but it's better than the current 'repeat until recursion
            # depth exceeded' approach <wink>
            raise HTTPError(req.get_full_url(), 401, "digest auth failed",
                            headers, None)
        else:
            self.retried += 1
        if authreq:
            scheme = authreq.split()[0]
            if scheme.lower() == 'digest':
                return self.retry_http_digest_auth(req, authreq)
            else:
                raise ValueError("AbstractDigestAuthHandler doesn't know "
                                 "about %s"%(scheme))

    def retry_http_digest_auth(self, req, auth):
        """Re-open req with an Authorization value computed from auth.

        Returns None when no authorization could be computed or when the
        identical value is already present on the request.
        """
        token, challenge = auth.split(' ', 1)
        chal = parse_keqv_list(parse_http_list(challenge))
        auth = self.get_authorization(req, chal)
        if auth:
            auth_val = 'Digest %s' % auth
            # Request.add_header() stores keys capitalize()d, so look the
            # header up the same way; otherwise a name such as
            # 'Proxy-Authorization' never matches the stored key.
            stored_key = self.auth_header.capitalize()
            if req.headers.get(stored_key, None) == auth_val:
                return None
            req.add_header(self.auth_header, auth_val)
            resp = self.parent.open(req)
            return resp

    def get_cnonce(self, nonce):
        """Return a 16-character hexadecimal client nonce."""
        # The cnonce-value is an opaque
        # quoted string value provided by the client and used by both client
        # and server to avoid chosen plaintext attacks, to provide mutual
        # authentication, and to provide some message integrity protection.
        # This isn't a fabulous effort, but it's probably Good Enough.
        dig = sha.new("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
                                       randombytes(8))).hexdigest()
        return dig[:16]

    def get_authorization(self, req, chal):
        """Build the Digest credentials string for challenge chal.

        chal is the parsed challenge as a dict.  Returns None when the
        challenge lacks 'realm' or 'nonce', names an unsupported
        algorithm, offers no supported qop, or no user is known for the
        realm and URL.
        """
        try:
            realm = chal['realm']
            nonce = chal['nonce']
        except KeyError:
            return None
        qop = chal.get('qop')
        algorithm = chal.get('algorithm', 'MD5')
        # mod_digest doesn't send an opaque, even though it isn't
        # supposed to be optional
        opaque = chal.get('opaque', None)

        H, KD = self.get_algorithm_impls(algorithm)
        if H is None:
            return None

        user, pw = self.passwd.find_user_password(realm, req.get_full_url())
        if user is None:
            return None

        if qop is not None:
            # The server may offer several comma-separated qop options;
            # only 'auth' is implemented.  Declining here replaces the
            # NameError the unsupported case used to trigger below.
            offered = [option.strip() for option in qop.split(',')]
            if 'auth' not in offered:
                # XXX handle auth-int.
                return None
            qop = 'auth'

        # XXX not implemented yet
        if req.has_data():
            entdig = self.get_entity_digest(req.get_data(), chal)
        else:
            entdig = None

        A1 = "%s:%s:%s" % (user, realm, pw)
        A2 = "%s:%s" % (req.has_data() and 'POST' or 'GET',
                        # XXX selector: what about proxies and full urls
                        req.get_selector())
        if qop == 'auth':
            self.nonce_count += 1
            ncvalue = '%08x' % self.nonce_count
            cnonce = self.get_cnonce(nonce)
            noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
            respdig = KD(H(A1), noncebit)
        else:
            # qop is None here: plain digest without qop.
            respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))

        # XXX should the partial digests be encoded too?

        base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
               'response="%s"' % (user, realm, nonce, req.get_selector(),
                                  respdig)
        if opaque:
            base = base + ', opaque="%s"' % opaque
        if entdig:
            base = base + ', digest="%s"' % entdig
        if algorithm != 'MD5':
            base = base + ', algorithm="%s"' % algorithm
        if qop:
            base = base + ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
        return base

    def get_algorithm_impls(self, algorithm):
        """Return the (H, KD) functions for algorithm.

        Returns (None, None) for an algorithm other than 'MD5' or 'SHA';
        get_authorization() tests H for None.  Previously the unknown
        case failed with an unbound local variable.
        """
        # lambdas assume digest modules are imported at the top level
        if algorithm == 'MD5':
            H = lambda x: md5.new(x).hexdigest()
        elif algorithm == 'SHA':
            H = lambda x: sha.new(x).hexdigest()
        else:
            # XXX MD5-sess
            return None, None
        KD = lambda s, d: H("%s:%s" % (s, d))
        return H, KD

    def get_entity_digest(self, data, chal):
        # XXX not implemented yet
        return None
class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """An authentication protocol defined by RFC 2069

    Digest authentication improves on basic authentication because it
    does not transmit passwords in the clear.
    """

    auth_header = 'Authorization'

    def http_error_401(self, req, fp, code, msg, headers):
        """Answer a 401 response, then reset the retry counter."""
        netloc = urlparse.urlparse(req.get_full_url())[1]
        response = self.http_error_auth_reqed('www-authenticate',
                                              netloc, req, headers)
        # Only reached when the retry did not raise.
        self.reset_retry_count()
        return response
class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
    """Answer 407 responses with Digest proxy credentials."""

    auth_header = 'Proxy-Authorization'

    def http_error_407(self, req, fp, code, msg, headers):
        """Answer a 407 response, then reset the retry counter."""
        proxy_host = req.get_host()
        response = self.http_error_auth_reqed('proxy-authenticate',
                                              proxy_host, req, headers)
        # Only reached when the retry did not raise.
        self.reset_retry_count()
        return response
850 class AbstractHTTPHandler(BaseHandler):
852 # XXX Should rewrite do_open() to use the new httplib interface,
853 # would be a little simpler.
855 def do_open(self, http_class, req):
856 host = req.get_host()
857 if not host:
858 raise URLError('no host given')
860 h = http_class(host) # will parse host:port
861 if req.has_data():
862 data = req.get_data()
863 h.putrequest('POST', req.get_selector())
864 if not 'Content-type' in req.headers:
865 h.putheader('Content-type',
866 'application/x-www-form-urlencoded')
867 if not 'Content-length' in req.headers:
868 h.putheader('Content-length', '%d' % len(data))
869 else:
870 h.putrequest('GET', req.get_selector())
872 scheme, sel = splittype(req.get_selector())
873 sel_host, sel_path = splithost(sel)
874 h.putheader('Host', sel_host or host)
875 for name, value in self.parent.addheaders:
876 name = name.capitalize()
877 if name not in req.headers:
878 h.putheader(name, value)
879 for k, v in req.headers.items():
880 h.putheader(k, v)
881 # httplib will attempt to connect() here. be prepared
882 # to convert a socket error to a URLError.
883 try:
884 h.endheaders()
885 except socket.error, err:
886 raise URLError(err)
887 if req.has_data():
888 h.send(data)
890 code, msg, hdrs = h.getreply()
891 fp = h.getfile()
892 if code == 200:
893 return addinfourl(fp, hdrs, req.get_full_url())
894 else:
895 return self.parent.error('http', req, fp, code, msg, hdrs)
class HTTPHandler(AbstractHTTPHandler):
    """Open http: URLs using httplib.HTTP."""

    def http_open(self, req):
        connection_class = httplib.HTTP
        return self.do_open(connection_class, req)
# HTTPSHandler is only defined when httplib provides an HTTPS class.
if hasattr(httplib, 'HTTPS'):
    class HTTPSHandler(AbstractHTTPHandler):
        """Open https: URLs using httplib.HTTPS."""

        def https_open(self, req):
            connection_class = httplib.HTTPS
            return self.do_open(connection_class, req)
class UnknownHandler(BaseHandler):
    """Last-resort handler: reject URLs whose scheme nobody handled."""

    def unknown_open(self, req):
        # Always raises; the message names the offending scheme.
        scheme = req.get_type()
        raise URLError('unknown url type: %s' % scheme)
def parse_keqv_list(l):
    """Parse list of key=value strings where keys are not duplicated.

    Returns a dict.  A value enclosed in double quotes has the quotes
    removed.  An empty value ('key=') is stored as the empty string;
    previously it raised IndexError.  An element without '=' raises
    ValueError.
    """
    parsed = {}
    for elt in l:
        k, v = elt.split('=', 1)
        # Slices instead of indexing so an empty value does not raise.
        if v[:1] == '"' and v[-1:] == '"':
            v = v[1:-1]
        parsed[k] = v
    return parsed
def parse_http_list(s):
    """Parse lists as described by RFC 2068 Section 2.

    In particular, parse comma-separated lists where the elements of
    the list may include quoted-strings.  A quoted-string could
    contain a comma, and a backslash inside a quoted-string escapes the
    next character (so an escaped quote does not end the string).

    Returns a list of the elements with surrounding whitespace
    stripped; quotes and backslashes are kept verbatim and empty
    elements are dropped.  An unterminated quoted-string extends to the
    end of the input.
    """
    # A single pass over the characters replaces the earlier index
    # arithmetic, which forgot to advance the element start once no
    # quotes remained ('a, b, c' produced ['a', 'a, b', 'a, b, c']).
    elements = []
    current = []
    in_quote = False
    escaped = False
    for ch in s:
        if escaped:
            current.append(ch)
            escaped = False
        elif in_quote:
            if ch == '\\':
                escaped = True
            elif ch == '"':
                in_quote = False
            current.append(ch)
        elif ch == ',':
            elements.append(''.join(current))
            current = []
        else:
            if ch == '"':
                in_quote = True
            current.append(ch)
    elements.append(''.join(current))
    stripped = [element.strip() for element in elements]
    return [element for element in stripped if element]
class FileHandler(BaseHandler):
    """Open file: URLs from the local file system."""

    # Use local file or FTP depending on form of URL
    def file_open(self, req):
        """Open req locally, or re-dispatch it as ftp.

        A selector that starts with exactly two slashes is re-dispatched
        through the parent as an ftp request.
        """
        selector = req.get_selector()
        looks_remote = selector[:2] == '//' and selector[2:3] != '/'
        if not looks_remote:
            return self.open_local_file(req)
        req.type = 'ftp'
        return self.parent.open(req)

    # names for the localhost
    names = None
    def get_names(self):
        """Return the addresses of this host, resolved once and cached.

        The result is stored on the FileHandler class.
        """
        if FileHandler.names is None:
            FileHandler.names = (socket.gethostbyname('localhost'),
                                 socket.gethostbyname(socket.gethostname()))
        return FileHandler.names

    # not entirely sure what the rules are here
    def open_local_file(self, req):
        """Return the local file named by req as an addinfourl object.

        Content-type, Content-length and Last-modified headers are
        synthesised from the file itself.  Raises URLError when the URL
        names a port or a host that does not resolve to this machine.
        """
        host = req.get_host()
        selector = req.get_selector()
        localfile = url2pathname(selector)
        stats = os.stat(localfile)
        size = stats.st_size
        modified = rfc822.formatdate(stats.st_mtime)
        mtype = mimetypes.guess_type(selector)[0]
        header_text = (
            'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified))
        headers = mimetools.Message(StringIO(header_text))
        if host:
            host, port = splitport(host)
        # port is only bound when host was given; the 'not host' test
        # short-circuits before it is read otherwise.
        if not host or \
           (not port and socket.gethostbyname(host) in self.get_names()):
            return addinfourl(open(localfile, 'rb'),
                              headers, 'file:'+selector)
        raise URLError('file not on local host')
1010 class FTPHandler(BaseHandler):
1011 def ftp_open(self, req):
1012 host = req.get_host()
1013 if not host:
1014 raise IOError, ('ftp error', 'no host given')
1015 host, port = splitport(host)
1016 if port is None:
1017 port = ftplib.FTP_PORT
1019 # username/password handling
1020 user, host = splituser(host)
1021 if user:
1022 user, passwd = splitpasswd(user)
1023 else:
1024 passwd = None
1025 host = unquote(host)
1026 user = unquote(user or '')
1027 passwd = unquote(passwd or '')
1029 try:
1030 host = socket.gethostbyname(host)
1031 except socket.error, msg:
1032 raise URLError(msg)
1033 path, attrs = splitattr(req.get_selector())
1034 dirs = path.split('/')
1035 dirs = map(unquote, dirs)
1036 dirs, file = dirs[:-1], dirs[-1]
1037 if dirs and not dirs[0]:
1038 dirs = dirs[1:]
1039 try:
1040 fw = self.connect_ftp(user, passwd, host, port, dirs)
1041 type = file and 'I' or 'D'
1042 for attr in attrs:
1043 attr, value = splitattr(attr)
1044 if attr.lower() == 'type' and \
1045 value in ('a', 'A', 'i', 'I', 'd', 'D'):
1046 type = value.upper()
1047 fp, retrlen = fw.retrfile(file, type)
1048 headers = ""
1049 mtype = mimetypes.guess_type(req.get_full_url())[0]
1050 if mtype:
1051 headers += "Content-type: %s\n" % mtype
1052 if retrlen is not None and retrlen >= 0:
1053 headers += "Content-length: %d\n" % retrlen
1054 sf = StringIO(headers)
1055 headers = mimetools.Message(sf)
1056 return addinfourl(fp, headers, req.get_full_url())
1057 except ftplib.all_errors, msg:
1058 raise IOError, ('ftp error', msg), sys.exc_info()[2]
1060 def connect_ftp(self, user, passwd, host, port, dirs):
1061 fw = ftpwrapper(user, passwd, host, port, dirs)
1062 ## fw.ftp.set_debuglevel(1)
1063 return fw
class CacheFTPHandler(FTPHandler):
    """FTPHandler variant that reuses ftpwrapper connections.

    Connections are cached under the key (user, host, port, joined
    directory path).  Every use of a connection moves its expiry time to
    ``delay`` seconds from now; expired connections are closed and
    dropped, and the oldest entries are dropped once the cache holds
    ``max_conns`` connections.
    """
    # XXX would be nice to have pluggable cache strategies
    # XXX this stuff is definitely not thread safe

    def __init__(self):
        self.cache = {}         # key -> ftpwrapper
        self.timeout = {}       # key -> expiry time, as a time.time() value
        self.soonest = 0        # smallest expiry time in self.timeout
        self.delay = 60         # seconds added to "now" on every use
        self.max_conns = 16     # cache size at which entries are evicted

    def setTimeout(self, t):
        """Set the number of seconds a connection stays cached after use."""
        self.delay = t

    def setMaxConns(self, m):
        """Set the cache size at which the oldest entries are evicted."""
        self.max_conns = m

    def connect_ftp(self, user, passwd, host, port, dirs):
        """Return a cached ftpwrapper for this login, creating it if needed."""
        key = user, host, port, '/'.join(dirs)
        fw = self.cache.get(key)
        if fw is None:
            fw = ftpwrapper(user, passwd, host, port, dirs)
            self.cache[key] = fw
        self.timeout[key] = time.time() + self.delay
        # Protect the entry we are about to hand out: without this the
        # clean-up pass could remove it (tiny max_conns, non-positive
        # delay, or equal timestamps) and the lookup below would fail.
        self.check_cache(key)
        return fw

    def check_cache(self, keep=None):
        """Drop expired connections, then trim the cache if it is full.

        keep -- optional cache key that must survive this pass.
        """
        now = time.time()

        # first check for old ones
        if self.soonest <= now:
            # iterate over a copy, entries are deleted inside the loop
            for key, expires in list(self.timeout.items()):
                if expires < now and key != keep:
                    self.cache[key].close()
                    del self.cache[key]
                    del self.timeout[key]

        # then check the size; ">=" (rather than "==") keeps trimming
        # effective after max_conns has been lowered with setMaxConns
        while len(self.cache) >= self.max_conns:
            oldest = None
            for key, expires in self.timeout.items():
                if key == keep:
                    continue
                if oldest is None or expires < self.timeout[oldest]:
                    oldest = key
            if oldest is None:
                # nothing left that may be evicted
                break
            # NOTE(review): as before, the evicted wrapper is only
            # forgotten, not closed -- presumably because it may still
            # be in use by a caller; confirm.
            del self.cache[oldest]
            del self.timeout[oldest]

        # min() of an empty sequence raises ValueError, so fall back to
        # the initial value when nothing is cached any more
        if self.timeout:
            self.soonest = min(self.timeout.values())
        else:
            self.soonest = 0
class GopherHandler(BaseHandler):
    """Opens gopher:// URLs by delegating to gopherlib."""

    def gopher_open(self, req):
        """Send the request's selector (and query, if any) to its host.

        Returns addinfourl(fp, noheaders(), url).  Raises GopherError
        when the request names no host.
        """
        raw_host = req.get_host()
        if not raw_host:
            raise GopherError('no host given')
        server = unquote(raw_host)

        # The gopher type prefix is split off but not used any further.
        gtype, remainder = splitgophertype(req.get_selector())
        path, query = splitquery(remainder)
        path = unquote(path)

        if not query:
            stream = gopherlib.send_selector(path, server)
        else:
            stream = gopherlib.send_query(path, unquote(query), server)
        return addinfourl(stream, noheaders(), req.get_full_url())
# Experimental and unfinished (replace_handler is still a stub) --
# don't use this yet.
class OpenerFactory:
    """Builds OpenerDirector objects from a list of handlers.

    default_handlers are always installed; handlers registered through
    add_handler are installed after them.  Entries may be handler
    classes (instantiated without arguments) or handler instances.
    """

    default_handlers = [UnknownHandler, HTTPHandler,
                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
                        FTPHandler, FileHandler]
    handlers = []
    replacement_handlers = []

    def add_handler(self, h):
        """Register an extra handler class or instance for build_opener."""
        # Rebind instead of appending so the class-level list shared by
        # all factories is never mutated.
        self.handlers = self.handlers + [h]

    def replace_handler(self, h):
        """Not implemented yet; currently does nothing."""
        pass

    def build_opener(self):
        """Return a new OpenerDirector with all registered handlers added."""
        opener = OpenerDirector()
        for ph in list(self.default_handlers) + list(self.handlers):
            if inspect.isclass(ph):
                ph = ph()
            opener.add_handler(ph)
        # Previously the opener was built and then discarded.
        return opener
1150 if __name__ == "__main__":
1151 # XXX some of the test code depends on machine configurations that
1152 # are internal to CNRI. Need to set up a public server with the
1153 # right authentication configuration for test purposes.
1154 if socket.gethostname() == 'bitdiddle':
1155 localhost = 'bitdiddle.cnri.reston.va.us'
1156 elif socket.gethostname() == 'bitdiddle.concentric.net':
1157 localhost = 'localhost'
1158 else:
1159 localhost = None
1160 urls = [
1161 # Thanks to Fred for finding these!
1162 'gopher://gopher.lib.ncsu.edu/11/library/stacks/Alex',
1163 'gopher://gopher.vt.edu:10010/10/33',
1165 'file:/etc/passwd',
1166 'file://nonsensename/etc/passwd',
1167 'ftp://www.python.org/pub/python/misc/sousa.au',
1168 'ftp://www.python.org/pub/tmp/blat',
1169 'http://www.espn.com/', # redirect
1170 'http://www.python.org/Spanish/Inquistion/',
1171 ('http://www.python.org/cgi-bin/faqw.py',
1172 'query=pythonistas&querytype=simple&casefold=yes&req=search'),
1173 'http://www.python.org/',
1174 'ftp://gatekeeper.research.compaq.com/pub/DEC/SRC/research-reports/00README-Legal-Rules-Regs',
1177 ## if localhost is not None:
1178 ## urls = urls + [
1179 ## 'file://%s/etc/passwd' % localhost,
1180 ## 'http://%s/simple/' % localhost,
1181 ## 'http://%s/digest/' % localhost,
1182 ## 'http://%s/not/found.h' % localhost,
1183 ## ]
1185 ## bauth = HTTPBasicAuthHandler()
1186 ## bauth.add_password('basic_test_realm', localhost, 'jhylton',
1187 ## 'password')
1188 ## dauth = HTTPDigestAuthHandler()
1189 ## dauth.add_password('digest_test_realm', localhost, 'jhylton',
1190 ## 'password')
1193 cfh = CacheFTPHandler()
1194 cfh.setTimeout(1)
1196 ## # XXX try out some custom proxy objects too!
1197 ## def at_cnri(req):
1198 ## host = req.get_host()
1199 ## print host
1200 ## if host[-18:] == '.cnri.reston.va.us':
1201 ## return 1
1202 ## p = CustomProxy('http', at_cnri, 'proxy.cnri.reston.va.us')
1203 ## ph = CustomProxyHandler(p)
1205 ## install_opener(build_opener(dauth, bauth, cfh, GopherHandler, ph))
1206 install_opener(build_opener(cfh, GopherHandler))
1208 for url in urls:
1209 if isinstance(url, tuple):
1210 url, req = url
1211 else:
1212 req = None
1213 print url
1214 try:
1215 f = urlopen(url, req)
1216 except IOError, err:
1217 print "IOError:", err
1218 except socket.error, err:
1219 print "socket.error:", err
1220 else:
1221 buf = f.read()
1222 f.close()
1223 print "read %d bytes" % len(buf)
1224 print
1225 time.sleep(0.1)